wuwenyi
/
py-hutun-robot


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
							"""
Xiaohongshu (小红书) browser automation: manual login and note scraping via Playwright.
"""
import json
import logging
from time import sleep
from typing import Optional

from playwright.sync_api import sync_playwright, Playwright

from browser import BaseBrowser
from tools import utils
from util.lock_util import LockManager
from util.playwright_util import is_element_present
# from .client import XiaoHongShuClient
from .rotate_ident import RotateIdent

# Module-level LockManager instance (not referenced in the visible part of this file).
lock_manager = LockManager()
# Base URL of the Xiaohongshu web site; the login page and note URLs are built from it.
XHS_URL = 'https://www.xiaohongshu.com'


class XhsBrowser(BaseBrowser):
    """Browser session for the Xiaohongshu web site, built on ``BaseBrowser``.

    Provides a manual login flow and helpers that read note details out of
    the page's ``window.__INITIAL_STATE__`` object through Playwright.
    """

    def __init__(self, phone: str, playwright=None):
        """Forward ``phone`` and the optional ``playwright`` object to ``BaseBrowser``."""
        super().__init__(phone, playwright)

    def __get_name__(self):
        """Return the short identifier of this browser type (``'xhs'``)."""
        return 'xhs'

    def __init_browser__(self):
        """Initialise the base browser, then attach and run the ``RotateIdent`` handler.

        NOTE(review): ``handle_rotate`` presumably deals with the site's
        rotation verification challenge; confirm in ``rotate_ident``.
        """
        super().__init_browser__()
        self.rotate_ident = RotateIdent(self.page)
        # The API client currently hits 406 and 416 errors; until that is
        # resolved the client is not used.
        # self.xhs_client = self.create_xhs_client(None)
        # self.page.goto('https://www.xiaohongshu.com/explore/66b4c36b000000001e01cf8a',wait_until='domcontentloaded')
        self.rotate_ident.handle_rotate()
        # This cookie is only needed when the client is used directly; do not
        # enable it when data is fetched through Playwright.
        # self.browser.add_cookies([{
        #     'name': "webId",
        #     'value': "xxx123",  # any value
        #     'domain': ".xiaohongshu.com",
        #     'path': "/"
        # }])

    # def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
    #     """Create xhs client"""
    #     utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
    #     cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies())
    #     xhs_client_obj = XiaoHongShuClient(
    #         proxies=httpx_proxy,
    #         headers={
    #             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    #             "Cookie": cookie_str,
    #             "Origin": "https://www.xiaohongshu.com",
    #             "Referer": "https://www.xiaohongshu.com",
    #             "Content-Type": "application/json;charset=UTF-8"
    #         },
    #         playwright_page=self.page,
    #         rotate_ident=self.rotate_ident,
    #         cookie_dict=cookie_dict,
    #     )
    #     return xhs_client_obj

    def login(self):
        """Open the site and wait 60 seconds so the user can log in by hand.

        Playwright is stopped afterwards, including when navigation or the
        wait raises, so the driver is not leaked on failure.
        """
        self.__init_browser__()
        try:
            # Manual login for now.
            self.page.goto(XHS_URL)
            self.page.wait_for_timeout(60_000)
        finally:
            self.playwright.stop()

    def polish_huitun_note(self, huitun_notes: list) -> list:
        """Fill in missing data on Huitun notes from the Xiaohongshu note page.

        For each note dict, the note identified by its ``noteId`` is loaded
        and the dict is updated in place: ``authorInfo`` is set to the note's
        ``user`` and, when the note has an ``imageList``, ``imageList`` is
        replaced by the list of ``urlDefault`` values. A failure on one note
        is logged and does not stop the remaining notes.

        :param huitun_notes: list of note dicts, each expected to carry ``noteId``
        :return: the notes whose ``authorInfo`` is not ``None``
        """
        self.__init_browser__()
        for huitun_note in huitun_notes:
            note_id = huitun_note.get('noteId')
            if not note_id:
                # Without an id there is no page to load; skip the navigation.
                utils.logger.error("skip huitun note without noteId")
                continue
            try:
                # note_info = self.xhs_client.get_note_by_id(note_id=note_id)
                note_info = self.get_note(note_id=note_id)
                huitun_note['authorInfo'] = note_info.get('user')
                images = note_info.get('imageList')
                if images:
                    huitun_note['imageList'] = [img.get('urlDefault') for img in images]
                # Pause between successfully fetched notes.
                self.page.wait_for_timeout(4000)
            except Exception as e:
                # Best effort: one failing note must not abort the whole batch.
                utils.logger.error(f"爬取小红书异常 {e}")
        return [note for note in huitun_notes if note.get('authorInfo') is not None]

    def get_note(self, note_id: str):
        """Load a note page and return its detail data as parsed JSON.

        The data is read from
        ``window.__INITIAL_STATE__.note.noteDetailMap[note_id].note``.

        :param note_id: id of the note, used to build ``{XHS_URL}/explore/{note_id}``
        :return: the decoded note object
        :raises ValueError: if the page exposes no state or no data for the note
        """
        # note = self.xhs_client.get_note_by_id(note_id=note_id)
        url = f'{XHS_URL}/explore/{note_id}'
        self.page.goto(url)
        self.rotate_ident.handle_rotate()
        if self.page.url != url:
            # The page ended up on a different URL: click the first note card,
            # then try the direct URL once more.
            self.page.locator('.note-item').nth(0).click()
            self.page.goto(url)
        data = self.page.evaluate('noteId => window.__INITIAL_STATE__ && JSON.stringify(window.__INITIAL_STATE__.note.noteDetailMap[noteId].note)', note_id)
        if not data:
            # evaluate() yields None when __INITIAL_STATE__ or the note is
            # missing; fail with a clear message instead of json.loads(None).
            raise ValueError(f"note {note_id} not found in window.__INITIAL_STATE__ at {url}")
        return json.loads(data)