crawler_util.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. # -*- coding: utf-8 -*-
  2. # @Author : relakkes@gmail.com
  3. # @Time : 2023/12/2 12:53
  4. # @Desc : 爬虫相关的工具函数
  5. import base64
  6. import random
  7. import re
  8. from io import BytesIO
  9. from typing import Dict, List, Optional, Tuple
  10. import httpx
  11. from PIL import Image, ImageDraw
  12. from playwright.async_api import Cookie, Page
  13. from . import utils
  14. async def find_login_qrcode(page: Page, selector: str) -> str:
  15. """find login qrcode image from target selector"""
  16. try:
  17. elements = await page.wait_for_selector(
  18. selector=selector,
  19. )
  20. login_qrcode_img = str(await elements.get_property("src")) # type: ignore
  21. if "http://" in login_qrcode_img or "https://" in login_qrcode_img:
  22. async with httpx.AsyncClient(follow_redirects=True) as client:
  23. utils.logger.info(f"[find_login_qrcode] get qrcode by url:{login_qrcode_img}")
  24. resp = await client.get(login_qrcode_img, headers={"User-Agent": get_user_agent()})
  25. if resp.status_code == 200:
  26. image_data = resp.content
  27. base64_image = base64.b64encode(image_data).decode('utf-8')
  28. return base64_image
  29. raise Exception(f"fetch login image url failed, response message:{resp.text}")
  30. return login_qrcode_img
  31. except Exception as e:
  32. print(e)
  33. return ""
  34. def show_qrcode(qr_code) -> None: # type: ignore
  35. """parse base64 encode qrcode image and show it"""
  36. if "," in qr_code:
  37. qr_code = qr_code.split(",")[1]
  38. qr_code = base64.b64decode(qr_code)
  39. image = Image.open(BytesIO(qr_code))
  40. # Add a square border around the QR code and display it within the border to improve scanning accuracy.
  41. width, height = image.size
  42. new_image = Image.new('RGB', (width + 20, height + 20), color=(255, 255, 255))
  43. new_image.paste(image, (10, 10))
  44. draw = ImageDraw.Draw(new_image)
  45. draw.rectangle((0, 0, width + 19, height + 19), outline=(0, 0, 0), width=1)
  46. new_image.show()
  47. def get_user_agent() -> str:
  48. ua_list = [
  49. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
  50. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
  51. "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
  52. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
  53. "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
  54. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
  55. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  56. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.5112.79 Safari/537.36",
  57. "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
  58. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  59. "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.5060.53 Safari/537.36",
  60. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.4844.84 Safari/537.36",
  61. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
  62. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5112.79 Safari/537.36",
  63. "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
  64. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
  65. "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5060.53 Safari/537.36",
  66. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.4844.84 Safari/537.36",
  67. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
  68. "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5112.79 Safari/537.36"
  69. ]
  70. return random.choice(ua_list)
  71. def get_mobile_user_agent() -> str:
  72. ua_list = [
  73. "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
  74. "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
  75. "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1",
  76. "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1",
  77. "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
  78. "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36",
  79. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
  80. "Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36"
  81. ]
  82. return random.choice(ua_list)
  83. def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
  84. if not cookies:
  85. return "", {}
  86. cookies_str = ";".join([f"{cookie.get('name')}={cookie.get('value')}" for cookie in cookies])
  87. cookie_dict = dict()
  88. for cookie in cookies:
  89. cookie_dict[cookie.get('name')] = cookie.get('value')
  90. return cookies_str, cookie_dict
  91. def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
  92. cookie_dict: Dict[str, str] = dict()
  93. if not cookie_str:
  94. return cookie_dict
  95. for cookie in cookie_str.split(";"):
  96. cookie = cookie.strip()
  97. if not cookie:
  98. continue
  99. cookie_list = cookie.split("=")
  100. if len(cookie_list) != 2:
  101. continue
  102. cookie_value = cookie_list[1]
  103. if isinstance(cookie_value, list):
  104. cookie_value = "".join(cookie_value)
  105. cookie_dict[cookie_list[0]] = cookie_value
  106. return cookie_dict
  107. def match_interact_info_count(count_str: str) -> int:
  108. if not count_str:
  109. return 0
  110. match = re.search(r'\d+', count_str)
  111. if match:
  112. number = match.group()
  113. return int(number)
  114. else:
  115. return 0