__init__.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
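"""
Playwright-based automation for the Huitun site (https://xhs.huitun.com/):
logging in, searching notes by tag, and collecting notes for hot topics.
"""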
  3. import logging
  4. from huitun.captcha_ident import CaptchaIdent
  5. from playwright.sync_api import sync_playwright, Page, Playwright
  6. import api
  7. from browser import BaseBrowser
  8. from util.lock_util import LockManager
  9. from util.playwright_util import is_element_present
  10. from urllib.parse import urlparse, parse_qs
# Entry URL of the Huitun site; HuiTunBrowser.login navigates here.
HUITUN_URL = 'https://xhs.huitun.com/'
# Shared lock manager; the search methods acquire/release a lock keyed by phone.
lock_manager = LockManager()
# phone -> password; written by HuiTunBrowser.login, read by login_if_need.
password_dict = {}
  14. class HuiTunBrowser(BaseBrowser):
  15. def __get_name__(self):
  16. return 'huitun'
  17. def login(self, password: str):
  18. """
  19. 登录抖音,一个登录之后,全部的页面都有了登录状态
  20. :return: 2- 需要验证码 1-登录成功
  21. """
  22. self.__init_browser__()
  23. self.page.goto(HUITUN_URL)
  24. password_dict[self.phone] = password
  25. self.login_if_need()
  26. self.page.wait_for_timeout(30_000)
  27. self.close()
  28. def login_if_need(self):
  29. """
  30. 登录灰豚
  31. """
  32. login_info_expired = self.page.query_selector('.ant-btn-primary:has-text("知道了")')
  33. if login_info_expired is not None:
  34. login_info_expired.click()
  35. if is_element_present(self.page, '.ant-modal-body'):
  36. logging.info('灰豚需要重新登录')
  37. if not is_element_present(self.page, 'text=密码登录'):
  38. pwd_login = self.page.query_selector('.b9dOaTo9gfF3wLAi7jlXTg\=\=')
  39. if pwd_login is not None:
  40. pwd_login.click()
  41. self.page.get_by_placeholder('请输入手机号').type(self.phone)
  42. self.page.get_by_placeholder('6-15位数字与字母组合').type(password_dict.get(self.phone))
  43. self.page.get_by_text('登 录', exact=True).click()
  44. # 验证码登录
  45. captcha_frame = self.page.frames[1]
  46. if captcha_frame is not None:
  47. captcha_tool = CaptchaIdent(self.page)
  48. captcha_tool.start()
  49. def search_note(self, tag_name: str, size: int):
  50. lock_manager.acquire_lock(self.phone)
  51. try:
  52. self.__init_browser__()
  53. self.list_result = []
  54. self.has_more = True
  55. api.assert_not_none(tag_name, "标签不能为空")
  56. self.page.goto('https://xhs.huitun.com/#/note/note_search')
  57. self.page.wait_for_timeout(2000)
  58. self.login_if_need()
  59. # 展开全部标签
  60. self.page.query_selector('.zgInWFcVVDjRN6BUMm3N0g\=\=').click()
  61. last_tag = self.page.query_selector('.fyBvQcyA81sogVJY0YVnhg\=\=')
  62. if last_tag is not None:
  63. last_tag.click()
  64. tag_ele = self.page.query_selector(f'.IRk6XOEYweiS9APLHrOp-w\=\=:has-text("{tag_name}")')
  65. if tag_ele is not None:
  66. tag_ele.click()
  67. self.page.get_by_text('图文笔记', exact=True).click()
  68. self.page.wait_for_timeout(500)
  69. self.page.on('response', self.search_note_handler)
  70. self.page.get_by_text('近3天', exact=True).click()
  71. # 限定一个上限
  72. page_num = int(2 * size / 10)
  73. for i in range(page_num):
  74. if size is not None and len(self.list_result) >= size:
  75. break
  76. logging.info('继续搜索灰豚')
  77. self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  78. self.page.wait_for_timeout(2000)
  79. logging.info('搜索灰豚结果数:%s', len(self.list_result))
  80. if not self.has_more:
  81. break
  82. return self.list_result
  83. finally:
  84. lock_manager.release_lock(self.phone)
  85. self.close()
  86. def search_note_handler(self, response):
  87. """
  88. 处理用户主页搜索图文请求响应
  89. :param response:
  90. :return:
  91. """
  92. if response is not None and '/note/search' in response.url:
  93. response_body = response.json()
  94. if response_body.get('status') == 0:
  95. note_list = response_body.get('extData').get('list')
  96. self.has_more = len(note_list) > 0
  97. if len(self.list_result) == 0:
  98. self.list_result = note_list
  99. else:
  100. self.list_result.extend(note_list)
  101. else:
  102. self.has_more = False
  103. def search_note_by_hot_tag(self, size: int):
  104. """抓取热词搜索文章"""
  105. lock_manager.acquire_lock(self.phone)
  106. try:
  107. self.__init_browser__()
  108. self.list_result = []
  109. self.has_more = True
  110. self.page.on('response', self.hot_tag_handler)
  111. self.page.goto('https://xhs.huitun.com/#/hot/topic_list')
  112. self.login_if_need()
  113. self.page.wait_for_timeout(3000)
  114. api.assert_not_empty(self.list_result,"获取标签失败")
  115. self.list_result = self.list_result[:size]
  116. topic_map = {}
  117. for tag in self.list_result:
  118. logging.info(f'搜索标签:{tag.get("topicName")}')
  119. topic_id = tag.get("topicId")
  120. topic_map[topic_id] = tag
  121. self.page.goto(f'https://xhs.huitun.com/#/anchor/topic_detail?id={topic_id}')
  122. self.page.reload()
  123. self.page.get_by_text('笔记分析',exact=True).click()
  124. self.page.on('response', self.hot_tag_note_handler)
  125. self.page.get_by_text('近7天', exact=True).click()
  126. page_num = 1
  127. # 目前版本最多只能翻 50页
  128. while self.map_result.get(topic_id) is not None and len(self.map_result.get(topic_id)) < 30 and page_num <= 50:
  129. logging.info(f'继续搜索热标签图文,当前文章数量:{len(self.map_result.get(topic_id))}, 页数:{page_num}')
  130. self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  131. self.page.wait_for_timeout(1000)
  132. self.page.query_selector('.ant-pagination-next').click()
  133. self.page.wait_for_timeout(1000)
  134. page_num += 1
  135. return {
  136. 'tagList': self.list_result,
  137. 'tagNotes': self.map_result
  138. }
  139. finally:
  140. lock_manager.release_lock(self.phone)
  141. self.close()
  142. def hot_tag_handler(self, response):
  143. """处理热词搜索请求响应"""
  144. if response is not None and ('/topic/search' in response.url or '/rank/topic/add' in response.url):
  145. response_body = response.json()
  146. if response_body.get('status') == 0:
  147. tag_list = response_body.get('extData').get('list')
  148. self.list_result = tag_list[0:19]
  149. def hot_tag_note_handler(self, response):
  150. """处理热词搜索文章请求响应"""
  151. if response is not None and '/topic/detail/notes/' in response.url:
  152. response_body = response.json()
  153. if response_body.get('status') == 0:
  154. parsed_url = urlparse( response.url)
  155. query_params = parse_qs(parsed_url.query)
  156. topic_id = query_params.get('topicId', [None])[0]
  157. note_list = response_body.get('extData').get('list')
  158. # 只筛选图文笔记
  159. note_list = [note for note in note_list if note.get('type') == 'normal']
  160. exist_note_list = self.map_result.get(topic_id)
  161. if exist_note_list is None:
  162. self.map_result[topic_id] = note_list
  163. else:
  164. exist_note_list.extend(note_list)