# Source file: __init__.py (3.9 KB)
  1. """
  2. 小红书
  3. """
  4. import json
  5. import logging
  6. from time import sleep
  7. from typing import Optional
  8. from playwright.sync_api import sync_playwright, Playwright
  9. from browser import BaseBrowser
  10. from tools import utils
  11. from util.lock_util import LockManager
  12. from util.playwright_util import is_element_present
  13. # from .client import XiaoHongShuClient
  14. from .rotate_ident import RotateIdent
  15. lock_manager = LockManager()
  16. XHS_URL = 'https://www.xiaohongshu.com'
  17. class XhsBrowser(BaseBrowser):
  18. def __init__(self, phone: str, playwright=None):
  19. super().__init__(phone, playwright)
  20. def __get_name__(self):
  21. return 'xhs'
  22. def __init_browser__(self):
  23. super().__init_browser__()
  24. self.rotate_ident = RotateIdent(self.page)
  25. # client存在406和416异常,未解决前暂时不用client
  26. # self.xhs_client = self.create_xhs_client(None)
  27. # self.page.goto('https://www.xiaohongshu.com/explore/66b4c36b000000001e01cf8a',wait_until='domcontentloaded')
  28. self.rotate_ident.handle_rotate()
  29. # 这个 cookie是直接用 client时候用的,如果用 playwright获取数据,不要打开
  30. # self.browser.add_cookies([{
  31. # 'name': "webId",
  32. # 'value': "xxx123", # any value
  33. # 'domain': ".xiaohongshu.com",
  34. # 'path': "/"
  35. # }])
  36. # def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
  37. # """Create xhs client"""
  38. # utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
  39. # cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies())
  40. # xhs_client_obj = XiaoHongShuClient(
  41. # proxies=httpx_proxy,
  42. # headers={
  43. # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  44. # "Cookie": cookie_str,
  45. # "Origin": "https://www.xiaohongshu.com",
  46. # "Referer": "https://www.xiaohongshu.com",
  47. # "Content-Type": "application/json;charset=UTF-8"
  48. # },
  49. # playwright_page=self.page,
  50. # rotate_ident=self.rotate_ident,
  51. # cookie_dict=cookie_dict,
  52. # )
  53. # return xhs_client_obj
  54. def login(self):
  55. self.__init_browser__()
  56. # 暂时采用手动登录
  57. self.page.goto(XHS_URL)
  58. self.page.wait_for_timeout(60_000)
  59. self.playwright.stop()
  60. def polish_huitun_note(self, huitun_notes: []):
  61. """
  62. 补齐灰豚文章数据
  63. :param huitun_notes:
  64. :return:
  65. """
  66. self.__init_browser__()
  67. for huitun_note in huitun_notes:
  68. try:
  69. note_id = huitun_note.get('noteId')
  70. # note_info = self.xhs_client.get_note_by_id(note_id=note_id)
  71. note_info = self.get_note(note_id=note_id)
  72. huitun_note['authorInfo'] = note_info.get('user')
  73. if note_info.get('imageList'):
  74. huitun_note['imageList'] = [img.get('urlDefault') for img in note_info.get('imageList')]
  75. self.page.wait_for_timeout(4000)
  76. except Exception as e:
  77. utils.logger.error(f"爬取小红书异常 {e}")
  78. return list(filter(lambda note: note.get('authorInfo') is not None, huitun_notes))
  79. def get_note(self, note_id: str):
  80. # note = self.xhs_client.get_note_by_id(note_id=note_id)
  81. url = f'{XHS_URL}/explore/{note_id}'
  82. self.page.goto(url)
  83. if self.page.url != url:
  84. self.page.locator('.note-item').nth(0).click()
  85. self.page.goto(url)
  86. self.rotate_ident.handle_rotate()
  87. data = self.page.evaluate('noteId => window.__INITIAL_STATE__ && JSON.stringify(window.__INITIAL_STATE__.note.noteDetailMap[noteId].note)', note_id)
  88. return json.loads(data)