"""
Xiaohongshu (Little Red Book)
"""
import json
import logging
from time import sleep
from typing import Optional

from playwright.sync_api import sync_playwright, Playwright

from browser import BaseBrowser
from tools import utils
from util.lock_util import LockManager
from util.playwright_util import is_element_present
# from .client import XiaoHongShuClient
from .rotate_ident import RotateIdent
# Module-wide LockManager instance (not referenced in the visible part of this file).
lock_manager = LockManager()
# Root URL of the Xiaohongshu web site; note-page URLs are built from it.
XHS_URL = 'https://www.xiaohongshu.com'
class XhsBrowser(BaseBrowser):
    """Playwright-driven browser session for Xiaohongshu (xiaohongshu.com).

    Note data is read from the rendered page's ``window.__INITIAL_STATE__``
    instead of the HTTP API client, which is currently disabled (see the
    comments in ``__init_browser__``).
    """

    def __init__(self, phone: str, playwright=None):
        """Forward ``phone`` and ``playwright`` unchanged to ``BaseBrowser``.

        :param phone: phone number string handed to the base class.
        :param playwright: optional object handed to the base class
            (presumably an already-started Playwright instance -- confirm
            against ``BaseBrowser``).
        """
        super().__init__(phone, playwright)

    def __get_name__(self):
        """Return the short platform name, ``'xhs'``."""
        return 'xhs'

    def __init_browser__(self):
        """Run the base browser setup, then create and run the ``RotateIdent`` helper.

        Sets ``self.rotate_ident`` (built from ``self.page``) and calls its
        ``handle_rotate()`` once.
        """
        super().__init_browser__()
        self.rotate_ident = RotateIdent(self.page)
        # The API client hits unresolved 406 and 416 errors, so it stays
        # disabled until those are fixed.
        # self.xhs_client = self.create_xhs_client(None)
        # self.page.goto('https://www.xiaohongshu.com/explore/66b4c36b000000001e01cf8a',wait_until='domcontentloaded')
        self.rotate_ident.handle_rotate()
        # This cookie is only needed when the client is used directly; do not
        # enable it when data is fetched through Playwright.
        # self.browser.add_cookies([{
        #     'name': "webId",
        #     'value': "xxx123",  # any value
        #     'domain': ".xiaohongshu.com",
        #     'path': "/"
        # }])

    # def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
    #     """Create xhs client"""
    #     utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
    #     cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies())
    #     xhs_client_obj = XiaoHongShuClient(
    #         proxies=httpx_proxy,
    #         headers={
    #             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    #             "Cookie": cookie_str,
    #             "Origin": "https://www.xiaohongshu.com",
    #             "Referer": "https://www.xiaohongshu.com",
    #             "Content-Type": "application/json;charset=UTF-8"
    #         },
    #         playwright_page=self.page,
    #         rotate_ident=self.rotate_ident,
    #         cookie_dict=cookie_dict,
    #     )
    #     return xhs_client_obj

    def login(self):
        """Open the site and keep the page open for 60 seconds for a manual login.

        Playwright is stopped afterwards even when navigation or the wait
        raises, so the driver is not left running.
        """
        self.__init_browser__()
        try:
            # Manual login for now: the operator signs in while the page is open.
            self.page.goto(XHS_URL)
            self.page.wait_for_timeout(60_000)
        finally:
            self.playwright.stop()

    def polish_huitun_note(self, huitun_notes: list) -> list:
        """Fill in missing data on Huitun notes from the Xiaohongshu note page.

        Each note dict is updated in place: ``authorInfo`` is set from the
        fetched note's ``user`` entry and, when the fetched note has a
        non-empty ``imageList``, ``imageList`` is replaced by the list of its
        ``urlDefault`` values. A failure on one note is logged and does not
        stop the remaining notes.

        :param huitun_notes: list of note dicts, each read via ``noteId``.
        :return: the notes whose ``authorInfo`` is not ``None``.
        """
        self.__init_browser__()
        for huitun_note in huitun_notes:
            try:
                note_id = huitun_note.get('noteId')
                # note_info = self.xhs_client.get_note_by_id(note_id=note_id)
                note_info = self.get_note(note_id=note_id)
                huitun_note['authorInfo'] = note_info.get('user')
                image_list = note_info.get('imageList')
                if image_list:
                    huitun_note['imageList'] = [img.get('urlDefault') for img in image_list]
                # Pause 4 seconds after each successfully processed note.
                self.page.wait_for_timeout(4000)
            except Exception as e:
                utils.logger.error(f"爬取小红书异常 {e}")
        return [note for note in huitun_notes if note.get('authorInfo') is not None]

    def get_note(self, note_id: str):
        """Load a note page and return its note object from the page state.

        The note is read from
        ``window.__INITIAL_STATE__.note.noteDetailMap[note_id].note``,
        serialised to JSON in the page and decoded here.

        :param note_id: note id used both in the URL and as the map key.
        :return: the decoded note (whatever ``json.loads`` yields).
        :raises TypeError: from ``json.loads`` when the page expression yields
            no string (e.g. ``window.__INITIAL_STATE__`` is missing).
        """
        # note = self.xhs_client.get_note_by_id(note_id=note_id)
        url = f'{XHS_URL}/explore/{note_id}'
        self.page.goto(url)
        self.rotate_ident.handle_rotate()
        if self.page.url != url:
            # The browser ended up on a different URL: click the first
            # '.note-item' element, then request the note URL again.
            self.page.locator('.note-item').nth(0).click()
            self.page.goto(url)
        data = self.page.evaluate(
            'noteId => window.__INITIAL_STATE__ && JSON.stringify(window.__INITIAL_STATE__.note.noteDetailMap[noteId].note)',
            note_id,
        )
        return json.loads(data)
|