__init__.py 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
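"""
小红书 (Xiaohongshu) browser automation.

Defines ``XhsBrowser``, a Playwright-driven browser session for
https://www.xiaohongshu.com.
"""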
  4. import json
  5. import logging
  6. from time import sleep
  7. from typing import Optional
  8. from playwright.sync_api import sync_playwright, Playwright
  9. from browser import BaseBrowser
  10. from tools import utils
  11. from util.lock_util import LockManager
  12. from util.playwright_util import is_element_present
  13. # from .client import XiaoHongShuClient
  14. from .rotate_ident import RotateIdent
  15. lock_manager = LockManager()
  16. XHS_URL = 'https://www.xiaohongshu.com'
  17. class XhsBrowser(BaseBrowser):
  18. def __init__(self, phone: str, playwright=None):
  19. super().__init__(phone, playwright)
  20. def __get_name__(self):
  21. return 'xhs'
  22. def __init_browser__(self):
  23. super().__init_browser__()
  24. self.rotate_ident = RotateIdent(self.page)
  25. # client存在406和416异常,未解决前暂时不用client
  26. # self.xhs_client = self.create_xhs_client(None)
  27. # self.page.goto('https://www.xiaohongshu.com/explore/66b4c36b000000001e01cf8a',wait_until='domcontentloaded')
  28. self.rotate_ident.handle_rotate()
  29. self.browser.add_cookies([{
  30. 'name': "webId",
  31. 'value': "xxx123", # any value
  32. 'domain': ".xiaohongshu.com",
  33. 'path': "/"
  34. }])
  35. # def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
  36. # """Create xhs client"""
  37. # utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
  38. # cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies())
  39. # xhs_client_obj = XiaoHongShuClient(
  40. # proxies=httpx_proxy,
  41. # headers={
  42. # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  43. # "Cookie": cookie_str,
  44. # "Origin": "https://www.xiaohongshu.com",
  45. # "Referer": "https://www.xiaohongshu.com",
  46. # "Content-Type": "application/json;charset=UTF-8"
  47. # },
  48. # playwright_page=self.page,
  49. # rotate_ident=self.rotate_ident,
  50. # cookie_dict=cookie_dict,
  51. # )
  52. # return xhs_client_obj
  53. def login(self):
  54. self.__init_browser__()
  55. # 暂时采用手动登录
  56. self.page.goto(XHS_URL)
  57. self.page.wait_for_timeout(60_000)
  58. self.playwright.stop()
  59. def polish_huitun_note(self, huitun_notes: []):
  60. """
  61. 补齐灰豚文章数据
  62. :param huitun_notes:
  63. :return:
  64. """
  65. self.__init_browser__()
  66. for huitun_note in huitun_notes:
  67. try:
  68. note_id = huitun_note.get('noteId')
  69. # note_info = self.xhs_client.get_note_by_id(note_id=note_id)
  70. note_info = self.get_note(note_id=note_id)
  71. huitun_note['authorInfo'] = note_info.get('user')
  72. if note_info.get('imageList'):
  73. huitun_note['imageList'] = [img.get('urlDefault') for img in note_info.get('imageList')]
  74. self.page.wait_for_timeout(4000)
  75. except Exception as e:
  76. utils.logger.error(f"爬取小红书异常 {e}")
  77. return list(filter(lambda note: note.get('authorInfo') is not None, huitun_notes))
  78. def get_note(self, note_id: str):
  79. # note = self.xhs_client.get_note_by_id(note_id=note_id)
  80. self.page.goto(f'{XHS_URL}/explore/{note_id}', wait_until='domcontentloaded')
  81. self.rotate_ident.handle_rotate()
  82. data = self.page.evaluate('noteId => window.__INITIAL_STATE__ && JSON.stringify(window.__INITIAL_STATE__.note.noteDetailMap[noteId].note)', note_id)
  83. return json.loads(data)