__init__.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. """
  2. """
  3. import logging
  4. import threading
  5. from playwright.sync_api import sync_playwright, Page, Playwright
  6. import api
# Landing page of the Huitun site; HuiTunBrowser.login navigates here first.
HUITUN_URL = 'https://xhs.huitun.com/'
  8. def is_element_present(page, selector):
  9. try:
  10. page.wait_for_selector(selector, timeout=2000)
  11. return True
  12. except Exception:
  13. return False
  14. class LockManager():
  15. """
  16. 全局锁管理,每个手机号只能打开一个上下文相同的浏览器
  17. """
  18. def __init__(self):
  19. self.locks = {}
  20. def acquire_lock(self, key):
  21. if key not in self.locks:
  22. self.locks[key] = threading.Lock()
  23. acquire = self.locks[key].acquire(timeout=300)
  24. if acquire:
  25. logging.info(f"{key} 获取锁成功")
  26. def release_lock(self, key):
  27. if key in self.locks:
  28. self.locks[key].release()
  29. logging.info(f"{key} 释放锁成功")
  30. def is_locked(self, key):
  31. """
  32. 检查给定的键是否处于锁定状态
  33. """
  34. if key in self.locks:
  35. return self.locks[key].locked()
  36. else:
  37. return False
# Module-level LockManager instance; HuiTunBrowser.search_note acquires and
# releases it keyed by phone number.
lock_manager = LockManager()
  39. class HuiTunBrowser:
  40. def __init__(self, phone: str):
  41. api.assert_not_none(phone, "手机号不能为空")
  42. self.phone = phone
  43. self.browser = None
  44. self.page = None
  45. self.result = None
  46. self.list_result = []
  47. self.has_more = False
  48. def __init_browser__(self, playwright: Playwright):
  49. self.browser = playwright.chromium.launch_persistent_context(
  50. user_data_dir=f'./.data/huitun/{self.phone}',
  51. headless=False,
  52. slow_mo=1000,
  53. channel="chrome",
  54. ignore_https_errors=True,
  55. args=[
  56. '--disable-blink-features=AutomationControlled',
  57. '--incognito',
  58. '--ignore-certificate-errors-spki-list',
  59. '--disable-web-security', # 禁用 Web 安全性,类似于 ChromeOptions 中的 --ignore-certificate-errors-spki-list
  60. '--no-sandbox', # 禁用沙盒模式
  61. '--disable-dev-shm-usage', # 禁用/dev/shm使用
  62. '--disable-features=site-per-process', # 禁用每个站点的进程,类似于 ChromeOptions 中的 --no-sandbox
  63. '--ignore-certificate-errors', # 忽略证书错误
  64. '--disable-features=AutomationControlled' # 禁用与自动化相关的特性
  65. ])
  66. self.browser.add_init_script(path="./stealth.min.js")
  67. self.page = self.browser.new_page()
  68. def close(self):
  69. if self.browser is not None:
  70. self.browser.close()
  71. if self.page is not None:
  72. self.page.close()
  73. def login(self, password: str):
  74. """
  75. 登录抖音,一个登录之后,全部的页面都有了登录状态
  76. :return: 2- 需要验证码 1-登录成功
  77. """
  78. with sync_playwright() as playwright:
  79. self.__init_browser__(playwright)
  80. self.page.goto(HUITUN_URL)
  81. if is_element_present(self.page, '.ant-modal-body'):
  82. if not is_element_present(self.page, 'text=密码登录'):
  83. pwd_login = self.page.query_selector('.b9dOaTo9gfF3wLAi7jlXTg\=\=')
  84. if pwd_login is not None:
  85. pwd_login.click()
  86. self.page.get_by_placeholder('请输入手机号').type(self.phone)
  87. self.page.get_by_placeholder('6-15位数字与字母组合').type(password)
  88. self.page.get_by_text('登 录', exact=True).click()
  89. self.page.wait_for_timeout(30_000)
  90. def search_note(self, tag_name: str, size: int):
  91. lock_manager.acquire_lock(self.phone)
  92. try:
  93. with sync_playwright() as playwright:
  94. self.__init_browser__(playwright)
  95. self.list_result = []
  96. api.assert_not_none(tag_name, "标签不能为空")
  97. self.page.on('response', self.search_note_handler)
  98. self.page.goto('https://xhs.huitun.com/#/note/note_search')
  99. self.page.wait_for_timeout(3000)
  100. while size is None or len(self.list_result) < size:
  101. logging.info('继续搜索用户主页')
  102. self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
  103. self.page.wait_for_timeout(2000)
  104. logging.info('搜索用户主页图文结果数:%s', len(self.list_result))
  105. self.close()
  106. return self.list_result
  107. finally:
  108. lock_manager.release_lock(self.phone)
    def search_note_handler(self, response):
        """
        Handle responses of the note-search request and collect the notes.

        Registered by ``search_note`` as the page's ``response`` listener.
        Only responses whose URL contains ``/note/search`` and whose JSON body
        has ``status == 0`` are used; their ``extData.list`` is accumulated in
        ``self.list_result``.

        :param response: response object given to the listener; must expose
            ``url`` and ``json()``.
        :return: None
        """
        if response is not None and '/note/search' in response.url:
            response_body = response.json()
            if response_body.get('status') == 0:
                # NOTE(review): assumes 'extData' is always present and its
                # 'list' is a list; a missing value would raise here. Confirm
                # against the actual API payload.
                note_list = response_body.get('extData').get('list')
                if len(self.list_result) == 0:
                    # First batch: adopt the payload's list object (no copy).
                    self.list_result = note_list
                else:
                    self.list_result.extend(note_list)