""" 小红书 """ import json import logging from time import sleep from typing import Optional from playwright.sync_api import sync_playwright, Playwright from browser import BaseBrowser from tools import utils from util.lock_util import LockManager from util.playwright_util import is_element_present # from .client import XiaoHongShuClient from .rotate_ident import RotateIdent lock_manager = LockManager() XHS_URL = 'https://www.xiaohongshu.com' class XhsBrowser(BaseBrowser): def __init__(self, phone: str, playwright=None): super().__init__(phone, playwright) def __get_name__(self): return 'xhs' def __init_browser__(self): super().__init_browser__() self.rotate_ident = RotateIdent(self.page) # client存在406和416异常,未解决前暂时不用client # self.xhs_client = self.create_xhs_client(None) # self.page.goto('https://www.xiaohongshu.com/explore/66b4c36b000000001e01cf8a',wait_until='domcontentloaded') self.rotate_ident.handle_rotate() # 这个 cookie是直接用 client时候用的,如果用 playwright获取数据,不要打开 # self.browser.add_cookies([{ # 'name': "webId", # 'value': "xxx123", # any value # 'domain': ".xiaohongshu.com", # 'path': "/" # }]) # def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient: # """Create xhs client""" # utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...") # cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies()) # xhs_client_obj = XiaoHongShuClient( # proxies=httpx_proxy, # headers={ # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", # "Cookie": cookie_str, # "Origin": "https://www.xiaohongshu.com", # "Referer": "https://www.xiaohongshu.com", # "Content-Type": "application/json;charset=UTF-8" # }, # playwright_page=self.page, # rotate_ident=self.rotate_ident, # cookie_dict=cookie_dict, # ) # return xhs_client_obj def login(self): self.__init_browser__() # 暂时采用手动登录 self.page.goto(XHS_URL) self.page.wait_for_timeout(60_000) self.playwright.stop() def polish_huitun_note(self, huitun_notes: []): """ 补齐灰豚文章数据 :param huitun_notes: :return: """ self.__init_browser__() for huitun_note in huitun_notes: try: note_id = huitun_note.get('noteId') # note_info = self.xhs_client.get_note_by_id(note_id=note_id) note_info = self.get_note(note_id=note_id) huitun_note['authorInfo'] = note_info.get('user') if note_info.get('imageList'): huitun_note['imageList'] = [img.get('urlDefault') for img in note_info.get('imageList')] self.page.wait_for_timeout(4000) except Exception as e: utils.logger.error(f"爬取小红书异常 {e}") return list(filter(lambda note: note.get('authorInfo') is not None, huitun_notes)) def get_note(self, note_id: str): # note = self.xhs_client.get_note_by_id(note_id=note_id) url = f'{XHS_URL}/explore/{note_id}' self.page.goto(url) self.rotate_ident.handle_rotate() if self.page.url != url: self.page.locator('.note-item').nth(0).click() self.page.goto(url) data = self.page.evaluate('noteId => window.__INITIAL_STATE__ && JSON.stringify(window.__INITIAL_STATE__.note.noteDetailMap[noteId].note)', note_id) return json.loads(data)