网址:
aHR0cHM6Ly9jaGFubmVsLmNoaW5hcXcuY29tL3UvcXcvcXN0ei5zaHRtbD9wYWdlcj0w
抓包分析
抓包后发现,返回数据中的标题并不是直接可读的中文,而是以 \uXXXX 形式转义的 Unicode 字符串;因此用正则提取出来之后,还需要把这些转义序列解码成正常的中文字符串。
代码展现:
import json
import re
import time

import parsel
import requests
# Listing pages are addressed by a zero-based ``pager`` query parameter.
LIST_URL_TEMPLATE = "https://channel.chinaqw.com/u/qw/qstz.shtml?pager={page}"
PAGE_COUNT = 100
# Seconds to wait for any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}

# The listing page embeds its data as ``var docArr=[...]`` inside a script.
DOC_ARR_RE = re.compile(r"var docArr=\[(.*?)]")
# Capture a JSON string body; ``\\.`` lets an escaped quote (\") stay inside
# the captured value instead of terminating it early.
TITLE_RE = re.compile(r'"title":"((?:[^"\\]|\\.)*)"')
URL_RE = re.compile(r'"url":"((?:[^"\\]|\\.)*)"')


def decode_json_string(raw):
    """Decode the body of a JSON string literal (without its quotes).

    Handles ``\\uXXXX`` escapes, ``\\/`` and the other JSON escapes, and
    leaves characters that are already non-ASCII untouched (unlike a
    ``unicode_escape`` round-trip, which would garble them).

    Args:
        raw: Text captured from between the double quotes of a JSON string.

    Returns:
        The decoded string, or ``raw`` unchanged if it is not valid JSON
        string content.
    """
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        return raw


def extract_entries(page_source):
    """Extract ``(title, url)`` pairs from a listing page's ``docArr`` data.

    Args:
        page_source: Text of the listing page.

    Returns:
        A list of ``(title, url)`` tuples with escapes decoded; an empty
        list when the page contains no ``docArr`` array.
    """
    found = DOC_ARR_RE.search(page_source)
    if found is None:
        return []
    data = found.group(1)
    titles = [decode_json_string(item) for item in TITLE_RE.findall(data)]
    # Stripping leftover backslashes keeps the URL usable even when JSON
    # decoding fell back to the raw text (e.g. "https:\/\/...").
    urls = [
        decode_json_string(item).replace("\\", "")
        for item in URL_RE.findall(data)
    ]
    return list(zip(titles, urls))


def extract_paragraphs(html):
    """Return the text of the article body paragraphs found in ``html``.

    Two selectors are tried because articles use different layouts:
    ``div.editor_content`` first, then ``div.left_zw`` as a fallback.
    """
    selector = parsel.Selector(html)
    paragraphs = selector.css("div.editor_content p::text").getall()
    if not paragraphs:
        paragraphs = selector.css("div.left_zw p::text").getall()
    return paragraphs


def main():
    """Crawl every listing page and print each article's title, URL and body.

    A failed request (timeout, connection error, HTTP error status, malformed
    URL) is reported and skipped so that one bad page does not abort the run.
    """
    with requests.Session() as session:
        session.headers.update(HEADERS)
        for page in range(PAGE_COUNT):
            # Pause between listing pages to avoid hammering the server.
            time.sleep(1)
            link = LIST_URL_TEMPLATE.format(page=page)
            try:
                listing = session.get(link, timeout=REQUEST_TIMEOUT)
                listing.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping listing page {page}: {exc}")
                continue

            for title, url in extract_entries(listing.text):
                print(title, url)
                try:
                    article = session.get(url, timeout=REQUEST_TIMEOUT)
                    article.raise_for_status()
                except requests.RequestException as exc:
                    print(f"Skipping article {url}: {exc}")
                    continue
                # Let requests detect the charset from the body instead of
                # trusting the response headers.
                article.encoding = article.apparent_encoding
                print(extract_paragraphs(article.text))


if __name__ == "__main__":
    main()
Unicode编码转换代码
原始显示的是
用 Python 提取之后是这样:
# The title as it appears in the page source: literal backslash-u escape
# sequences. It must be a raw string here; in a normal string literal Python
# would already turn the escapes into characters, and the decode below would
# then produce garbled text.
title = r'\u4E0A\u6D77\u4FA8\u5546\u4FA8\u9886\u4FA8\u9752\u8D74\u6D59\u6C5F\u6E56\u5DDE\u5F00\u5C55\u5408\u4F5C\u5BF9\u63A5'
# Encoding as ASCII makes the precondition explicit: unicode_escape is only
# safe on pure-ASCII escape text (it would misread real non-ASCII bytes).
coded_str = title.encode('ascii').decode('unicode_escape')