1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- """
- Xiaohongshu (小红书) browser automation: manual login and enrichment of Huitun note records.
- """
- import logging
- from time import sleep
- from typing import Optional
- from playwright.sync_api import sync_playwright, Playwright
- from browser import BaseBrowser
- from tools import utils
- from util.lock_util import LockManager
- from util.playwright_util import is_element_present
- from .client import XiaoHongShuClient
- from .rotate_ident import RotateIdent
# Module-level LockManager instance, created at import time.
# NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still needed.
- lock_manager = LockManager()
# Base URL of the Xiaohongshu web site; opened by XhsBrowser.__init_browser__.
- XHS_URL = 'https://www.xiaohongshu.com'
class XhsBrowser(BaseBrowser):
    """Browser session for the Xiaohongshu web site.

    Extends ``BaseBrowser`` with an API client bound to the browser page
    (``self.xhs_client``) and a ``RotateIdent`` helper (``self.rotate_ident``),
    both created in :meth:`__init_browser__`.
    """

    def __init__(self, phone: str, playwright=None):
        """Forward ``phone`` and the optional ``playwright`` handle to ``BaseBrowser``."""
        super().__init__(phone, playwright)

    def __get_name__(self):
        """Return the short identifier of this browser type (``'xhs'``)."""
        return 'xhs'

    def __init_browser__(self):
        """Run the base initialisation, then prepare the Xiaohongshu session.

        The API client is created before the site is opened, so it is built
        from the cookies the browser holds at that moment.
        """
        super().__init_browser__()
        self.xhs_client = self.create_xhs_client(None)  # no HTTP proxy
        self.rotate_ident = RotateIdent(self.page)
        self.page.goto(XHS_URL)
        # NOTE(review): presumably deals with a rotation-style verification
        # prompt shown after landing — confirm against RotateIdent.
        self.rotate_ident.handle_rotate()

    def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
        """Create the Xiaohongshu API client from the current browser cookies.

        :param httpx_proxy: value passed through as the client's ``proxies``
            argument; ``None`` when no proxy is used.
        :return: a ``XiaoHongShuClient`` bound to ``self.page`` and carrying
            the browser cookies both as a ``Cookie`` header and as a dict.
        """
        utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
        cookie_str, cookie_dict = utils.convert_cookies(self.browser.cookies())
        request_headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/126.0.0.0 Safari/537.36"
            ),
            "Cookie": cookie_str,
            # Same value as the site URL the browser navigates to.
            "Origin": XHS_URL,
            "Referer": XHS_URL,
            "Content-Type": "application/json;charset=UTF-8",
        }
        return XiaoHongShuClient(
            proxies=httpx_proxy,
            headers=request_headers,
            playwright_page=self.page,
            cookie_dict=cookie_dict,
        )

    def login(self):
        """Open the site and keep the page alive for 60 seconds for a manual login."""
        # NOTE(review): the Playwright instance started here is not handed to
        # the browser; presumably BaseBrowser obtains its own — confirm.
        with sync_playwright():
            self.__init_browser__()
            # Login is done by hand for now: just wait while the user signs in.
            self.page.wait_for_timeout(60_000)

    def polish_huitun_note(self, huitun_notes: list) -> list:
        """Fill in Huitun note records with data fetched from Xiaohongshu.

        For each note the ``noteId`` is looked up through the API client; the
        returned ``user`` is stored under ``authorInfo`` and the
        ``url_default`` of every ``image_list`` entry under ``imageList``.
        The note dicts are updated in place. A failure on one note is logged
        and does not stop the remaining notes from being processed.

        :param huitun_notes: list of note dicts, each read via ``.get('noteId')``.
        :return: the same ``huitun_notes`` list, on every path.
        """
        self.__init_browser__()
        if not self.xhs_client.pong():
            # NOTE(review): pong() presumably checks that the session is
            # usable — confirm. When it is falsy the notes are returned untouched.
            return huitun_notes
        for huitun_note in huitun_notes:
            try:
                note_id = huitun_note.get('noteId')
                note_info = self.xhs_client.get_note_by_id(note_id=note_id)
                huitun_note['authorInfo'] = note_info.get('user')
                # A missing or None image_list yields an empty list instead
                # of raising and leaving imageList unset.
                images = note_info.get('image_list') or []
                huitun_note['imageList'] = [img.get('url_default') for img in images]
            except Exception as e:
                utils.logger.error(f"爬取小红书异常 {e}")
            else:
                # Pause one second after each successful lookup.
                sleep(1)
        # Return the list here as well, matching the early-return path above.
        return huitun_notes
|