Pārlūkot izejas kodu

feat: 灰豚rpa init

wuwenyi 7 mēneši atpakaļ
revīzija
cc6b93534f
9 mainītis faili ar 431 papildinājumiem un 0 dzēšanām
  1. 73 0
      .gitignore
  2. 3 0
      README.md
  3. 89 0
      api/__init__.py
  4. 23 0
      api/login.py
  5. 21 0
      api/search.py
  6. 72 0
      app.py
  7. 143 0
      huitun/__init__.py
  8. 1 0
      requirements.txt
  9. 6 0
      stealth.min.js

+ 73 - 0
.gitignore

@@ -0,0 +1,73 @@
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+Lib/
+Scripts/
+logs/
+**/__pycache__/
+
+### IntelliJ IDEA ###
+.idea/
+*.iws
+*.iml
+*.ipr
+
+# 浏览器上下文数据
+.data
+

+ 3 - 0
README.md

@@ -0,0 +1,3 @@
+# py-huitun-robot
+
+灰豚数据rpa

+ 89 - 0
api/__init__.py

@@ -0,0 +1,89 @@
+"""
+通用api
+"""
+import os
+import json
+import logging
+import time
+
+# Pre-serialized JSON body returned for successful requests that carry no data.
+SUCCESS_RESPONSE = json.dumps(
+    dict(code=1, msg="请求成功", success=True),
+    ensure_ascii=False,
+)
+
+# Root folder holding one sub-directory (browser profile) per account.
+directory = "./.data/huitun"
+# exist_ok=True creates the folder when missing without a separate
+# exists() check, so two processes starting together cannot race.
+os.makedirs(directory, exist_ok=True)
+# Registered accounts: the name of every sub-directory of ``directory``.
+# Plain files in that folder are ignored.
+phones = {
+    entry
+    for entry in os.listdir(directory)
+    if os.path.isdir(os.path.join(directory, entry))
+}
+
+print("已存在的账号:", phones)
+
+
+def contain_browser(phone):
+    """Return True when ``phone`` is in the module-level ``phones`` set."""
+    is_registered = phone in phones
+    return is_registered
+
+
+def get_idle_phone():
+    """
+    Block until a registered account is not locked, then return it.
+
+    Polls ``lock_manager.is_locked`` for every entry of ``phones`` and sleeps
+    one second between rounds. The returned phone is not locked by this
+    function; the caller is expected to take the lock itself.
+
+    :return: a phone from ``phones`` whose lock was free when checked
+    """
+    # Imported inside the function; huitun imports this module at load time,
+    # so a top-level import here would presumably be circular.
+    from huitun import lock_manager
+    while True:
+        # Iterate over a snapshot: add_phone() can run on another request
+        # thread, and a set that changes size during iteration raises
+        # RuntimeError.
+        for phone in tuple(phones):
+            if not lock_manager.is_locked(phone):
+                return phone
+        time.sleep(1)
+
+
+def add_phone(phone):
+    """Add ``phone`` to the module-level ``phones`` set (no effect if present)."""
+    phones.update((phone,))
+
+
+class BusinessException(Exception):
+    """
+    Business-rule failure carrying a user-facing message in ``msg``.
+    """
+
+    def __init__(self, msg):
+        """
+        :param msg: message describing the failure; also becomes ``args[0]``
+        """
+        # Hand the message (not ``self``) to Exception: with ``self`` as the
+        # only argument, str(exc) tries to render the exception itself and
+        # recurses until RecursionError.
+        super().__init__(msg)
+        self.msg = msg
+
+
+def raiseError(msg):
+    """Raise a BusinessException carrying ``msg``; never returns normally."""
+    error = BusinessException(msg)
+    raise error
+
+
+def fail_response(msg: str):
+    """
+    Build the JSON body for a failed request.
+
+    :param msg: failure message placed in the ``msg`` field
+    :return: JSON string with ``code`` 0, the message and ``success`` false
+    """
+    payload = dict(code=0, msg=msg, success=False)
+    return json.dumps(payload, ensure_ascii=False)
+
+
+def assert_not_none(data, msg):
+    """
+    Assertion helper: raise BusinessException(msg) when ``data`` is None.
+
+    Only None fails; other falsy values (0, "", []) pass.
+    """
+    if data is not None:
+        return
+    raise BusinessException(msg)
+
+
+def success(data=None):
+    """
+    Build the JSON body for a successful request.
+
+    :param data: optional payload; when None the shared SUCCESS_RESPONSE
+        string (which has no ``data`` field) is returned
+    :return: JSON string with ``code`` 1 and ``success`` true
+    """
+    if data is not None:
+        body = dict(code=1, msg="请求成功", data=data, success=True)
+        return json.dumps(body, ensure_ascii=False)
+    return SUCCESS_RESPONSE

+ 23 - 0
api/login.py

@@ -0,0 +1,23 @@
+"""
+登录接口
+"""
+
+from flask import Blueprint
+from flask import request
+
+import api
+import huitun
+
+login_opt = Blueprint('login', __name__)
+
+@login_opt.route('/login', methods=["POST"])
+def login():
+    """
+    Login endpoint.
+
+    Expects a JSON object with ``phone`` and ``password``, runs the browser
+    login for that account and then registers the phone with ``api.add_phone``.
+
+    :return: success envelope wrapping whatever ``HuiTunBrowser.login`` returns
+    :raises api.BusinessException: when phone or password is missing
+    """
+    # silent=True yields None instead of aborting on a missing or malformed
+    # JSON body, so the checks below can answer with a business error.
+    request_body = request.get_json(silent=True)
+    if not isinstance(request_body, dict):
+        request_body = {}
+    phone = request_body.get('phone')
+    password = request_body.get('password')
+    # The constructor rejects a missing phone before the password is checked.
+    browser = huitun.HuiTunBrowser(phone)
+    api.assert_not_none(password, "密码不能为空")
+    login_result = browser.login(password)
+    # Register the account so api.get_idle_phone() can hand it out without a
+    # restart; otherwise it is only discovered by the start-up directory scan.
+    api.add_phone(phone)
+    return api.success(login_result)

+ 21 - 0
api/search.py

@@ -0,0 +1,21 @@
+"""
+搜索API
+"""
+from flask import Blueprint
+from flask import request
+
+import api
+from huitun import HuiTunBrowser
+
+search_opt = Blueprint('search', __name__)
+
+@search_opt.route('/note', methods=["POST"])
+def search_note():
+    """
+    Search notes on the Huitun site by keyword.
+
+    Expects a JSON object with ``tagName`` (required) and ``searchLimit``
+    (optional; number of results to collect).
+
+    :return: success envelope wrapping the list returned by
+        ``HuiTunBrowser.search_note``
+    :raises api.BusinessException: when ``tagName`` is missing or
+        ``searchLimit`` is not an integer
+    """
+    # silent=True yields None instead of aborting on a missing or malformed
+    # JSON body, so the checks below can answer with a business error.
+    request_body = request.get_json(silent=True)
+    if not isinstance(request_body, dict):
+        request_body = {}
+    tag_name = request_body.get('tagName')
+    # Validate before waiting for an idle account and launching a browser.
+    api.assert_not_none(tag_name, "标签不能为空")
+    search_limit = request_body.get('searchLimit')
+    if search_limit is not None:
+        # The limit is compared with len(...) later on; a JSON string such as
+        # "10" would make that comparison raise TypeError.
+        try:
+            search_limit = int(search_limit)
+        except (TypeError, ValueError):
+            api.raiseError("searchLimit必须是整数")
+    browser = HuiTunBrowser(api.get_idle_phone())
+    result = browser.search_note(tag_name, search_limit)
+    return api.success(result)

+ 72 - 0
app.py

@@ -0,0 +1,72 @@
+import logging
+import os
+import traceback
+
+from flask import Flask, request
+
+from api import *
+from api import login, search
+
+# Folder containing this file; log files go into a "logs" folder next to it.
+current_folder = os.path.dirname(os.path.abspath(__file__))
+logs_folder = os.path.join(current_folder, 'logs')
+os.makedirs(logs_folder, exist_ok=True)
+
+# Log line format; it is attached to the file handler only.
+formatter = logging.Formatter("%(asctime)s %(levelname)s[%(funcName)s:%(lineno)s]:%(message)s", "%Y-%m-%d %H:%M:%S")
+file_handler = logging.FileHandler(filename=f"{current_folder}/logs/app.log", encoding="utf-8")
+file_handler.setFormatter(formatter)
+# The console handler has no formatter set, so it uses logging's default.
+console_handler = logging.StreamHandler()
+
+# Root logger: INFO and above go to the log file and to the console.
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+for _handler in (file_handler, console_handler):
+    logger.addHandler(_handler)
+
+app = Flask(__name__)
+
+# Mount each blueprint under its own URL prefix.
+for _blueprint, _prefix in ((login.login_opt, "/login"), (search.search_opt, "/search")):
+    app.register_blueprint(_blueprint, url_prefix=_prefix)
+
+
+@app.errorhandler(Exception)
+def handle_exception(error: Exception):
+    """
+    Global exception handler: answer every uncaught exception with the JSON
+    failure envelope.
+
+    :param error: the exception raised while handling the request
+    :return: ``(body, status)``. A BusinessException gives HTTP 200 with its
+        ``msg``; any other exception gives HTTP 500 with ``str(error)`` and is
+        logged together with its traceback
+    """
+    if isinstance(error, BusinessException):
+        return fail_response(error.msg), 200
+    # exc_info=error attaches the traceback to the log record, so it reaches
+    # every configured handler (including the log file), not just stderr.
+    logging.error("Unhandled exception: %s", error, exc_info=error)
+    return fail_response(str(error)), 500
+
+
+@app.before_request
+def log_request():
+    """
+    Log the method, URL and body of every incoming request.
+
+    When the body is a JSON object containing a ``password`` field, that
+    field is masked before the body is written to the log.
+    """
+    import json  # local import: this module does not import json directly
+
+    logging.info('Request: %s %s', request.method, request.url)
+    body = request.get_data(as_text=True)
+    # silent=True: bodies that are not JSON give None and are logged verbatim.
+    payload = request.get_json(silent=True)
+    if isinstance(payload, dict) and 'password' in payload:
+        # The login endpoint receives the account password in its body; keep
+        # the credential out of the log file.
+        body = json.dumps({**payload, 'password': '******'}, ensure_ascii=False)
+    logging.info('Request Body: %s', body)
+
+
+@app.after_request
+def log_response(response):
+    """
+    Log the response body and hand the response back unchanged.
+
+    Bodies longer than 1000 characters are cut to the first 1000 characters
+    followed by "..." in the log.
+    """
+    body = response.get_data(as_text=True)
+    shown = body if len(body) <= 1000 else body[:1000] + "..."
+    logging.info('Response Body: %s', shown)
+    return response
+
+
+if __name__ == '__main__':
+    # Serve on all interfaces, port 8978, with Flask's threaded server.
+    app.run(threaded=True, port=8978, host="0.0.0.0")

+ 143 - 0
huitun/__init__.py

@@ -0,0 +1,143 @@
+"""
+
+"""
+import logging
+import threading
+
+from playwright.sync_api import sync_playwright, Page, Playwright
+
+import api
+
+HUITUN_URL = 'https://xhs.huitun.com/'
+
+
+def is_element_present(page, selector):
+    """
+    Report whether ``selector`` shows up on ``page`` within 2000 ms.
+
+    :param page: object exposing ``wait_for_selector(selector, timeout=...)``
+    :param selector: selector string passed through unchanged
+    :return: True when the wait succeeds, False when it raises any Exception
+    """
+    try:
+        page.wait_for_selector(selector, timeout=2000)
+    except Exception:
+        return False
+    else:
+        return True
+
+
+class LockManager():
+    """
+    Global lock registry keyed by phone number.
+
+    Each phone may only have one browser open on its persistent context at a
+    time; callers take the key's lock before launching the browser and
+    release it when done.
+    """
+
+    def __init__(self):
+        self.locks = {}
+        # Guards creation of per-key locks so two threads asking for the same
+        # new key cannot end up with two different Lock objects.
+        self._guard = threading.Lock()
+
+    def acquire_lock(self, key, timeout=300):
+        """
+        Acquire the lock for ``key``, waiting up to ``timeout`` seconds.
+
+        :param key: lock name (the phone number)
+        :param timeout: maximum seconds to wait for the lock
+        :return: True once the lock is held
+        :raises TimeoutError: when the lock is still held by someone else
+            after ``timeout`` seconds; the caller must not continue as if it
+            owned the lock
+        """
+        with self._guard:
+            lock = self.locks.setdefault(key, threading.Lock())
+        if not lock.acquire(timeout=timeout):
+            raise TimeoutError(f"{key} 获取锁超时")
+        logging.info(f"{key} 获取锁成功")
+        return True
+
+    def release_lock(self, key):
+        """
+        Release the lock for ``key``.
+
+        Unknown keys and locks that are not currently held are ignored, so
+        this is safe to call from a ``finally`` block.
+        """
+        lock = self.locks.get(key)
+        if lock is None:
+            return
+        try:
+            lock.release()
+        except RuntimeError:
+            # threading.Lock.release() raises RuntimeError on an unlocked lock.
+            logging.warning(f"{key} 未持有锁,跳过释放")
+            return
+        logging.info(f"{key} 释放锁成功")
+
+    def is_locked(self, key):
+        """
+        Check whether the lock for ``key`` is currently held.
+
+        :return: False for keys that never had a lock created
+        """
+        lock = self.locks.get(key)
+        return lock is not None and lock.locked()
+
+
+# Process-wide registry instance.
+lock_manager = LockManager()
+
+
+class HuiTunBrowser:
+    """
+    Playwright-driven browser session for one Huitun account.
+
+    Every account (phone number) uses its own persistent Chrome profile under
+    ``./.data/huitun/<phone>``. Public operations take the account's lock
+    from ``lock_manager`` so only one browser runs on a profile at a time.
+    """
+
+    def __init__(self, phone: str):
+        """
+        :param phone: account phone number; also names the profile folder
+        :raises api.BusinessException: when ``phone`` is None
+        """
+        api.assert_not_none(phone, "手机号不能为空")
+        self.phone = phone
+        self.browser = None
+        self.page = None
+        self.result = None
+        self.list_result = []
+        self.has_more = False
+
+    def __init_browser__(self, playwright: Playwright):
+        """
+        Launch Chrome on this account's persistent profile and open a page.
+
+        Sets ``self.browser`` (the persistent context) and ``self.page``.
+        """
+        self.browser = playwright.chromium.launch_persistent_context(
+            user_data_dir=f'./.data/huitun/{self.phone}',
+            headless=False,
+            slow_mo=1000,
+            channel="chrome",
+            ignore_https_errors=True,
+            args=[
+                '--disable-blink-features=AutomationControlled',
+                '--incognito',
+                '--ignore-certificate-errors-spki-list',
+                '--disable-web-security',  # disable web security
+                '--no-sandbox',  # disable sandbox mode
+                '--disable-dev-shm-usage',  # do not use /dev/shm
+                '--disable-features=site-per-process',  # disable per-site process isolation
+                '--ignore-certificate-errors',  # ignore certificate errors
+                '--disable-features=AutomationControlled'  # disable automation-related features
+            ])
+        self.browser.add_init_script(path="./stealth.min.js")
+        self.page = self.browser.new_page()
+
+    def close(self):
+        """
+        Close the page and then the browser context, and forget both.
+
+        The page is closed first because it belongs to the context; the
+        context is closed even when closing the page raises.
+        """
+        page, browser = self.page, self.browser
+        self.page = None
+        self.browser = None
+        try:
+            if page is not None:
+                page.close()
+        finally:
+            if browser is not None:
+                browser.close()
+
+    def login(self, password: str):
+        """
+        Log this account in to Huitun through the password form.
+
+        Opens the site; when the login modal is shown, switches to password
+        login if needed, fills in phone and password, submits and then waits
+        30 seconds before closing the browser.
+        NOTE(review): the 30 s wait presumably leaves time for a manual
+        verification step - confirm.
+
+        :param password: account password typed into the form
+        :return: None. NOTE(review): the login endpoint documents result
+            codes (1 = logged in, 2 = verification code needed), but nothing
+            here computes them - confirm the intended contract
+        """
+        # Same profile, same lock as search_note: two browsers must not run
+        # on one persistent context at once.
+        lock_manager.acquire_lock(self.phone)
+        try:
+            with sync_playwright() as playwright:
+                try:
+                    self.__init_browser__(playwright)
+                    self.page.goto(HUITUN_URL)
+                    if is_element_present(self.page, '.ant-modal-body'):
+                        if not is_element_present(self.page, 'text=密码登录'):
+                            # Raw string: same selector bytes, without the
+                            # invalid "\=" escape warning.
+                            pwd_login = self.page.query_selector(r'.b9dOaTo9gfF3wLAi7jlXTg\=\=')
+                            if pwd_login is not None:
+                                pwd_login.click()
+                        self.page.get_by_placeholder('请输入手机号').type(self.phone)
+                        self.page.get_by_placeholder('6-15位数字与字母组合').type(password)
+                        self.page.get_by_text('登 录', exact=True).click()
+                        self.page.wait_for_timeout(30_000)
+                finally:
+                    self.close()
+        finally:
+            lock_manager.release_lock(self.phone)
+
+    def search_note(self, tag_name: str, size: int, max_idle_rounds: int = 5):
+        """
+        Collect note search results by scrolling the note-search page.
+
+        Results arrive through ``search_note_handler`` as the page issues
+        requests; the page is scrolled to the bottom every 2 seconds until
+        enough results are collected or no new ones arrive.
+        NOTE(review): ``tag_name`` is validated but never entered into the
+        page here, so the results are whatever the page loads by default -
+        confirm how the keyword is meant to be applied.
+
+        :param tag_name: search keyword; must not be None
+        :param size: stop once at least this many results are collected;
+            None means "collect until results stop arriving"
+        :param max_idle_rounds: stop after this many consecutive scrolls that
+            brought no new results, so the loop ends when the listing is
+            exhausted
+        :return: list of note entries; may hold more than ``size`` items
+        :raises api.BusinessException: when ``tag_name`` is None
+        """
+        # Validate before taking the lock and launching a browser.
+        api.assert_not_none(tag_name, "标签不能为空")
+        lock_manager.acquire_lock(self.phone)
+        try:
+            with sync_playwright() as playwright:
+                try:
+                    self.__init_browser__(playwright)
+                    self.list_result = []
+                    self.page.on('response', self.search_note_handler)
+                    self.page.goto('https://xhs.huitun.com/#/note/note_search')
+                    self.page.wait_for_timeout(3000)
+                    idle_rounds = 0
+                    while size is None or len(self.list_result) < size:
+                        seen = len(self.list_result)
+                        logging.info('继续搜索用户主页')
+                        self.page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        self.page.wait_for_timeout(2000)
+                        logging.info('搜索用户主页图文结果数:%s', len(self.list_result))
+                        if len(self.list_result) > seen:
+                            idle_rounds = 0
+                            continue
+                        # Nothing new after this scroll; give up after a few
+                        # such rounds instead of scrolling forever.
+                        idle_rounds += 1
+                        if idle_rounds >= max_idle_rounds:
+                            break
+                    return self.list_result
+                finally:
+                    # Close the browser on failures as well as on success.
+                    self.close()
+        finally:
+            lock_manager.release_lock(self.phone)
+
+    def search_note_handler(self, response):
+        """
+        Response listener: append note entries from search responses.
+
+        Only responses whose URL contains ``/note/search`` and whose JSON
+        body has ``status == 0`` are used; entries are read from
+        ``extData.list``. Bodies that are not JSON or lack those fields are
+        skipped.
+
+        :param response: response object with ``url`` and ``json()``
+        :return: None
+        """
+        if response is None or '/note/search' not in response.url:
+            return
+        try:
+            response_body = response.json()
+        except Exception:
+            # A matching URL can still answer with a non-JSON body; one bad
+            # response should not abort the whole search.
+            logging.warning('Ignoring unparsable response from %s', response.url)
+            return
+        if not isinstance(response_body, dict) or response_body.get('status') != 0:
+            return
+        ext_data = response_body.get('extData') or {}
+        note_list = ext_data.get('list') or []
+        self.list_result.extend(note_list)

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+playwright==1.43.0

Failā izmaiņas netiks attēlotas, jo tās ir par lielu
+ 6 - 0
stealth.min.js


Daži faili netika attēloti, jo izmaiņu fails ir pārāk liels