__init__.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. """
  2. 小红书
  3. """
  4. import logging
  5. from time import sleep
  6. from typing import Optional
  7. from playwright.sync_api import sync_playwright, Playwright
  8. from browser import BaseBrowser
  9. from tools import utils
  10. from util.lock_util import LockManager
  11. from util.playwright_util import is_element_present
  12. from .client import XiaoHongShuClient
  13. from .rotate_ident import RotateIdent
# Module-level LockManager instance; not referenced in the visible part of this file.
lock_manager = LockManager()
# Home page of the Xiaohongshu web site; XhsBrowser.__init_browser__ navigates to it.
XHS_URL = 'https://www.xiaohongshu.com'
  16. class XhsBrowser(BaseBrowser):
    def __init__(self, phone: str, playwright=None) -> None:
        """Create the Xiaohongshu browser wrapper.

        Both arguments are forwarded unchanged to ``BaseBrowser.__init__``;
        this subclass adds no construction logic of its own.

        :param phone: passed through to the base class.
        :param playwright: passed through to the base class; defaults to ``None``.
        """
        super().__init__(phone, playwright)
    def __get_name__(self) -> str:
        """Return the short identifier of this browser, ``'xhs'``."""
        return 'xhs'
  21. def __init_browser__(self):
  22. super().__init_browser__()
  23. self.xhs_client = self.create_xhs_client(None)
  24. self.rotate_ident = RotateIdent(self.page)
  25. self.page.goto(XHS_URL)
  26. self.rotate_ident.handle_rotate()
  27. def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
  28. """Create xhs client"""
  29. utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
  30. cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies())
  31. xhs_client_obj = XiaoHongShuClient(
  32. proxies=httpx_proxy,
  33. headers={
  34. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  35. "Cookie": cookie_str,
  36. "Origin": "https://www.xiaohongshu.com",
  37. "Referer": "https://www.xiaohongshu.com",
  38. "Content-Type": "application/json;charset=UTF-8"
  39. },
  40. playwright_page=self.page,
  41. cookie_dict=cookie_dict,
  42. )
  43. return xhs_client_obj
  44. def login(self):
  45. with sync_playwright() as playwright:
  46. self.__init_browser__()
  47. # 暂时采用手动登录
  48. self.page.wait_for_timeout(60_000)
  49. def polish_huitun_note(self, huitun_notes: []):
  50. """
  51. 补齐灰豚文章数据
  52. :param huitun_notes:
  53. :return:
  54. """
  55. self.__init_browser__()
  56. if not self.xhs_client.pong():
  57. return huitun_notes
  58. for huitun_note in huitun_notes:
  59. try:
  60. note_id = huitun_note.get('noteId')
  61. note_info = self.xhs_client.get_note_by_id(note_id=note_id)
  62. huitun_note['authorInfo'] = note_info.get('user')
  63. huitun_note['imageList'] = [img.get('url_default') for img in note_info.get('image_list')]
  64. sleep(1)
  65. except Exception as e:
  66. utils.logger.error(f"爬取小红书异常 {e}")