@@ -10,16 +10,14 @@ import api
from browser import BaseBrowser
from util.lock_util import LockManager
from util.playwright_util import is_element_present
+from urllib.parse import urlparse, parse_qs
# Base URL of the Huitun site that this module's browser navigates to.
HUITUN_URL = 'https://xhs.huitun.com/'
# Lock manager shared by every browser instance; locks are acquired and
# released per phone number.
lock_manager = LockManager()
# Login passwords keyed by phone number. Because the mapping lives at module
# level it is shared by all HuiTunBrowser instances in the process.
+password_dict = {}
class HuiTunBrowser(BaseBrowser):
- def __init__(self, phone: str, playwright=None):
- super().__init__(phone, playwright)
- self.password = None
# Return the fixed identifier string ('huitun') for this browser implementation.
def __get_name__(self):
return 'huitun'
@@ -30,7 +28,7 @@ class HuiTunBrowser(BaseBrowser):
- self.password = password
+ password_dict[self.phone] = password
@@ -49,7 +47,7 @@ class HuiTunBrowser(BaseBrowser):
if pwd_login is not None:
- self.page.get_by_placeholder('6-15位数字与字母组合').type(self.password)
+ self.page.get_by_placeholder('6-15位数字与字母组合').type(password_dict.get(self.phone))
self.page.get_by_text('登 录', exact=True).click()
# 验证码登录
captcha_frame = self.page.frames[1]
@@ -112,3 +110,68 @@ class HuiTunBrowser(BaseBrowser):
self.has_more = False
def search_note_by_hot_tag(self, size: int):
    """Scrape the hot-topic list and the image-text notes of its first topics.

    Opens the hot-topic list page and waits for ``hot_tag_handler`` to capture
    the topic list from the network traffic. It then visits the detail page of
    each of the first ``size`` topics and pages through its notes while
    ``hot_tag_note_handler`` accumulates them in ``self.map_result``.

    Args:
        size: Maximum number of topics to process, taken from the front of
            the captured topic list.

    Returns:
        A dict with two keys: ``'tagList'`` (the topics that were processed)
        and ``'tagNotes'`` (``self.map_result``, keyed by topic id as it
        appears in the notes request URL).

    Raises:
        Whatever ``api.assert_not_empty`` raises when no topic list was
        captured within the wait period.
    """
    min_notes_per_topic = 30
    # The current site version only allows paging through 50 pages.
    max_pages = 50

    lock_manager.acquire_lock(self.phone)
    try:
        self.__init_browser__()
        self.list_result = []
        # Start from an empty note map so notes collected by an earlier call
        # on this instance cannot leak into this call's result.
        self.map_result = {}
        self.has_more = True
        self.page.on('response', self.hot_tag_handler)
        self.page.goto('https://xhs.huitun.com/#/hot/topic_list')
        self.login_if_need()
        self.page.wait_for_timeout(3000)
        api.assert_not_empty(self.list_result, "获取标签失败")

        # Keep a local snapshot: hot_tag_handler stays attached and may
        # reassign self.list_result while the detail pages are loading.
        tags = self.list_result[:size]
        self.list_result = tags

        note_listener_attached = False
        for tag in tags:
            logging.info(f'搜索标签:{tag.get("topicName")}')
            topic_id = tag.get("topicId")
            # hot_tag_note_handler keys its results by the ``topicId`` query
            # parameter, which is always a str; normalise the id taken from
            # the tag payload so the lookups below match even if it is an int.
            topic_key = str(topic_id)
            self.page.goto(f'https://xhs.huitun.com/#/anchor/topic_detail?id={topic_id}')
            self.page.reload()
            self.page.get_by_text('笔记分析', exact=True).click()
            if not note_listener_attached:
                # Attach once, at the same point the first iteration used to.
                self.page.on('response', self.hot_tag_note_handler)
                note_listener_attached = True
            self.page.get_by_text('近7天', exact=True).click()
            # Give the first notes response a moment to arrive; without it
            # the paging loop below can be skipped before any data is stored.
            self.page.wait_for_timeout(1000)

            page_num = 1
            while page_num <= max_pages:
                notes = self.map_result.get(topic_key)
                if notes is None or len(notes) >= min_notes_per_topic:
                    break
                logging.info(f'继续搜索热标签图文,当前文章数量:{len(notes)}, 页数:{page_num}')
                self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                self.page.wait_for_timeout(1000)
                next_button = self.page.query_selector('.ant-pagination-next')
                if next_button is None:
                    # No pagination control on the page: nothing more to load.
                    break
                next_button.click()
                self.page.wait_for_timeout(1000)
                page_num += 1

        return {
            'tagList': tags,
            'tagNotes': self.map_result
        }
    finally:
        # Always close the browser, even if releasing the lock fails.
        try:
            lock_manager.release_lock(self.phone)
        finally:
            self.close()
def hot_tag_handler(self, response):
    """Capture the hot-topic list from a matching network response.

    Registered as a page ``response`` listener. Responses whose URL contains
    ``/topic/search`` or ``/rank/topic/add`` and whose JSON body reports
    ``status == 0`` have the first 19 entries of ``extData.list`` stored in
    ``self.list_result``. Any other response leaves the state untouched.

    Args:
        response: Response object exposing ``url`` and ``json()``; may be
            ``None``.
    """
    if response is None:
        return
    url = response.url
    if '/topic/search' not in url and '/rank/topic/add' not in url:
        return

    payload = response.json()
    if not isinstance(payload, dict) or payload.get('status') != 0:
        return

    # A successful response is not guaranteed to carry a topic list (for
    # example when ``extData`` is null); ignore it instead of raising inside
    # the event listener.
    ext_data = payload.get('extData')
    tag_list = ext_data.get('list') if isinstance(ext_data, dict) else None
    if not isinstance(tag_list, list):
        return

    self.list_result = tag_list[0:19]
def hot_tag_note_handler(self, response):
    """Accumulate image-text notes from a topic-notes network response.

    Registered as a page ``response`` listener. For responses whose URL
    contains ``/topic/detail/notes/`` and whose JSON body reports
    ``status == 0``, the notes with ``type == 'normal'`` are appended to
    ``self.map_result`` under the request's ``topicId`` query parameter.
    The key is a ``str``, or ``None`` when the parameter is absent.

    Args:
        response: Response object exposing ``url`` and ``json()``; may be
            ``None``.
    """
    if response is None or '/topic/detail/notes/' not in response.url:
        return

    payload = response.json()
    if not isinstance(payload, dict) or payload.get('status') != 0:
        return

    query_params = parse_qs(urlparse(response.url).query)
    topic_id = query_params.get('topicId', [None])[0]

    # Treat a missing or null ``extData`` / ``list`` as "no notes" rather
    # than raising inside the event listener.
    ext_data = payload.get('extData')
    raw_notes = ext_data.get('list') if isinstance(ext_data, dict) else None
    if not isinstance(raw_notes, list):
        raw_notes = []

    # Keep image-text notes only.
    image_text_notes = [
        note for note in raw_notes
        if isinstance(note, dict) and note.get('type') == 'normal'
    ]

    # The key is created even when no note matched, so callers can tell that
    # a response for this topic has been seen.
    self.map_result.setdefault(topic_id, []).extend(image_text_notes)