示例网址:https://www.feibzw.com/chapter-54164-43903904/
右键查看网页源代码,发现每次刷新后正文内容都会改变,其中部分文字使用了字体加密(如"\𢞪")。
同时发现页面引用的CSS文件也随每次刷新而改变;打开该CSS文件,即可得到以Base64编码内嵌的字体文件(如https://www.feibzw.com/style/_Tt61EGku8Z.css)。
实现(需先导入 re、base64、requests,以及 fontTools.ttLib 中的 TTFont):
def get_css_content(source_code, timeout=10):
    """Download the obfuscation stylesheet referenced by a chapter page.

    The page links a stylesheet under ``/style/_<name>.css``; its relative
    URL is extracted from the HTML and resolved against the module-level
    ``base_url``.

    Args:
        source_code: HTML source of the chapter page. ``None`` or an empty
            string is treated as "no stylesheet found".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The stylesheet text, or ``None`` when the link is not present, the
        request fails, or the server does not answer with HTTP 200.
    """
    if not source_code:
        return None

    pattern = r'<link rel="stylesheet" type="text/css" href="(/style/_.+?\.css)"/>'
    match = re.search(pattern, source_code)
    if not match:
        return None

    full_css_url = f"{base_url}{match.group(1)}"
    try:
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(full_css_url, timeout=timeout)
    except requests.RequestException as e:
        # Report and return None, mirroring how fetch_webpage handles errors.
        print(f"请求CSS文件时发生错误:{e}")
        return None

    if response.status_code == 200:
        return response.text
    return None
def save_font_file(css_content, output_path='font.woff2'):
    """Extract the Base64-embedded WOFF2 font from a stylesheet and save it.

    Args:
        css_content: Stylesheet text expected to contain an ``@font-face``
            rule for the ``YHFixed`` family with an inline
            ``data:font/woff2;base64,...`` source.
        output_path: Where to write the decoded font. Defaults to
            ``font.woff2`` in the current working directory.

    Returns:
        ``True`` if a font was found and written, ``False`` if the
        stylesheet does not contain the expected ``@font-face`` rule (in
        which case no file is created or modified).
    """
    pattern = r"@font-face\s*{\s*font-family: 'YHFixed';\s*font-style: normal;\s*src: url\(data:font/woff2;base64,(.*?)\) format\('woff2'\);"
    match = re.search(pattern, css_content or '')
    if not match:
        # Tell the caller, so it does not go on to read a stale font file.
        return False

    font_data = base64.b64decode(match.group(1))
    with open(output_path, 'wb') as file:
        file.write(font_data)
    return True
使用FontForge打开字体文件:
观察每一个被打乱的字符:它有一个序号(即网页源代码中使用的码位),以及对应真实文字的Unicode码(字形名形如 uniXXXX)。Unicode码可以直接转换为正常的字,所以我们只需要建立"序号 → Unicode码"的对应关系,再逐个转换即可。
def read_woff2(file_path):
    """Load a font file and return its best Unicode character map.

    Args:
        file_path: Path to the font file to open with ``TTFont``.

    Returns:
        The result of ``getBestCmap()`` on the font's ``cmap`` table: a
        mapping from code point (int) to glyph name (str).
    """
    # The context manager closes the underlying file once the map has been
    # read; the original left the handle open.
    with TTFont(file_path) as font:
        return font['cmap'].getBestCmap()
def get_unicode_chars(cmap, codes):
    """Translate one obfuscated code point into the real character.

    The font maps each scrambled code point to a glyph named
    ``uni<HEX>``, where ``<HEX>`` is the code point of the character that
    is actually drawn.

    Args:
        cmap: Mapping from code point (int) to glyph name (str), as
            returned by ``read_woff2``.
        codes: The obfuscated code point, as an int or a decimal string.

    Returns:
        The decoded single character, or ``''`` when the code point is not
        in ``cmap`` or its glyph name is not a decodable ``uni<HEX>`` name.

    Raises:
        ValueError: If ``codes`` cannot be converted to an integer.
    """
    code = int(codes)
    if not cmap:
        return ''

    glyph_name = cmap.get(code)
    if not glyph_name or not glyph_name.startswith('uni'):
        return ''

    try:
        return chr(int(glyph_name[3:], 16))
    except (ValueError, OverflowError):
        # Names such as "uni4E2D.alt" or multi-character ligature names do
        # not denote a single valid code point; treat them as unmapped
        # instead of crashing.
        return ''
获取网页源代码:
def fetch_webpage(url, timeout=10):
    """Fetch a page with a desktop-browser User-Agent and return its HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text on HTTP 200; ``None`` on any other status
        code or on a request error (a message is printed in both cases).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"请求网页时发生错误:{e}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"无法获取网页内容,状态码:{response.status_code}")
    return None
主函数:
# Script entry: download one chapter, fetch its obfuscation font, and print
# the decoded text.
# base_url stays at module level because get_css_content reads it as a global.
base_url = "https://www.feibzw.com"
webpage_url = f"{base_url}/chapter-54161-43902849/"

source_code = fetch_webpage(webpage_url)
if source_code:
    css_content = get_css_content(source_code)
    if css_content:
        # Writes font.woff2 to the current working directory.
        save_font_file(css_content)
        # NOTE(review): extract_and_process_text is not defined in the
        # visible part of this file; it must be provided before this runs.
        processed_text = extract_and_process_text(source_code)
        if processed_text:
            print(processed_text)
        else:
            print("无法提取和处理文本。")
    else:
        print("无法获取CSS文件内容。")
else:
    print("无法获取网页内容。")
实现截图: