wuwenyi 2 months ago
commit
525f3e11b4
13 changed files with 673 additions and 0 deletions
  1. 73 0
      .gitignore
  2. 16 0
      README.md
  3. 89 0
      api/__init__.py
  4. 12 0
      api/login.py
  5. 68 0
      api/search.py
  6. 70 0
      app.py
  7. 55 0
      browser/__init__.py
  8. 70 0
      instagram/__init__.py
  9. 82 0
      instagram/data_handler.py
  10. 2 0
      requirements.txt
  11. 6 0
      stealth.min.js
  12. 95 0
      util/json_util.py
  13. 35 0
      util/lock_util.py

+ 73 - 0
.gitignore

@@ -0,0 +1,73 @@
+# ---> Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+Lib/
+Scripts/
+logs/
+**/__pycache__/
+
+### IntelliJ IDEA ###
+.idea/
+*.iws
+*.iml
+*.ipr
+
+# 浏览器上下文数据
+.data
+

+ 16 - 0
README.md

@@ -0,0 +1,16 @@
+# py-instagram-robot
+
+python的instagram RPA
+
+```python
+from instagram import InstagramBrowser
+
+# __invoke__ acquires the per-account lock and launches its own Playwright
+# session and browser, so do not call sync_playwright() or __init_browser__ first.
+browser = InstagramBrowser('createx9951')
+result = browser.__invoke__(lambda_func=browser.search_blog,
+                            url='https://www.instagram.com/p/DC_xBDiTHrw/?utm_source=ig_web_copy_link&igsh=MzRlODBiNWFlZA%3D%3D&img_index=5')
+
+```

+ 89 - 0
api/__init__.py

@@ -0,0 +1,89 @@
+"""
+通用api
+"""
+import os
+import json
+import logging
+import time
+
+# Pre-serialised body returned by success() when there is no payload.
+SUCCESS_RESPONSE = json.dumps({
+    "code": 1,
+    "msg": "请求成功",
+    "success": True,
+}, ensure_ascii=False)
+
+# Root folder scanned for accounts: every sub-directory name is treated as one account.
+directory = "./.data/instagram"
+# exist_ok=True replaces the separate exists() check, which could race with
+# another process creating the folder between the check and the makedirs call.
+os.makedirs(directory, exist_ok=True)
+accounts = {
+    entry
+    for entry in os.listdir(directory)
+    if os.path.isdir(os.path.join(directory, entry))
+}
+
+print("已存在的账号:", accounts)
+
+
+def contain_browser(account):
+    """Return True when `account` is one of the registered accounts."""
+    return account in accounts
+
+
+def get_idle_account():
+    """
+    Return a registered account whose lock is not currently held.
+
+    Polls once per second until some account is free.
+    :raises BusinessException: when no account is registered at all, since
+        waiting could then never succeed and would hang the request
+    """
+    # Imported lazily: `instagram` imports this module, so a top-level import
+    # here would be circular.
+    from instagram import lock_manager
+    while True:
+        # Iterate over a snapshot: add_phone() may grow the set from another
+        # request thread, and a set that changes size during iteration raises
+        # RuntimeError.
+        candidates = tuple(accounts)
+        if not candidates:
+            raise BusinessException('没有可用的账号')
+        for account in candidates:
+            if not lock_manager.is_locked(account):
+                return account
+        time.sleep(1)
+
+
+def add_phone(account):
+    """Register `account` so that it can be handed out by get_idle_account()."""
+    accounts.add(account)
+
+
+class BusinessException(Exception):
+    """
+    Business-rule failure that carries a user-facing message in `msg`.
+    """
+
+    def __init__(self, msg):
+        # Hand the message (not `self`) to Exception: with `self` as the only
+        # argument, str(exc) asks for str(args[0]) == str(exc) and recurses
+        # until RecursionError.
+        super().__init__(msg)
+        self.msg = msg
+
+
+def raiseError(msg):
+    """Raise a BusinessException carrying `msg`."""
+    raise BusinessException(msg)
+
+
+def fail_response(msg: str):
+    """
+    Build the JSON text of a failed reply.
+
+    :param msg: failure description placed in the "msg" field
+    :return: JSON string with code 0 and success false
+    """
+    payload = {"code": 0, "msg": msg, "success": False}
+    # ensure_ascii=False keeps non-ASCII messages as-is instead of \u escapes.
+    return json.dumps(payload, ensure_ascii=False)
+
+
+def assert_not_none(data, msg):
+    """
+    Guard that rejects a missing value.
+
+    Only `None` is rejected; other falsy values (0, "", []) pass.
+    :raises BusinessException: with `msg` when `data` is None
+    """
+    if data is not None:
+        return
+    raise BusinessException(msg)
+
+
+def success(data=None):
+    """
+    Build the JSON text of a successful reply.
+
+    :param data: optional payload; when None the "data" field is omitted and
+        the pre-built SUCCESS_RESPONSE is returned
+    :return: JSON string with code 1 and success true
+    """
+    if data is not None:
+        body = {"code": 1, "msg": "请求成功", "data": data, "success": True}
+        return json.dumps(body, ensure_ascii=False)
+    return SUCCESS_RESPONSE

+ 12 - 0
api/login.py

@@ -0,0 +1,12 @@
+"""
+登录接口
+"""
+
+from flask import Blueprint
+from flask import request
+
+import api
+import instagram
+
+login_opt = Blueprint('login', __name__)
+

+ 68 - 0
api/search.py

@@ -0,0 +1,68 @@
+"""
+搜索API
+"""
+from flask import Blueprint
+from flask import request
+
+import api
+from instagram import InstagramBrowser
+
+search_opt = Blueprint('search', __name__)
+
+
+
+@search_opt.route('/user', methods=["POST"])
+def search_user():
+    """
+    Look up an Instagram user's profile information.
+
+    Expects a JSON object body with a "url" field holding the profile link.
+    :return: JSON success envelope wrapping the lookup result
+    """
+    # silent=True yields None for a missing or non-JSON body, so the caller
+    # receives the business error below instead of a failure on `.get`.
+    request_body = request.get_json(silent=True)
+    user_url = request_body.get('url') if isinstance(request_body, dict) else None
+    api.assert_not_none(user_url, '用户链接不能为空')
+    browser = InstagramBrowser(api.get_idle_account())
+    # NOTE(review): InstagramBrowser as shown in this commit defines no
+    # search_user method, and the browser is not launched on this path
+    # (the /blog route goes through __invoke__) — confirm before relying on it.
+    result = browser.search_user(user_url)
+    return api.success(result)
+
+
+@search_opt.route('/batch-users', methods=["POST"])
+def batch_users():
+    """
+    Look up several Instagram users in one request.
+
+    Expects a JSON object body with a "urls" field holding the profile links.
+    :return: JSON success envelope wrapping the lookup result
+    """
+    # silent=True yields None for a missing or non-JSON body, so the caller
+    # receives the business error below instead of a failure on `.get`.
+    request_body = request.get_json(silent=True)
+    user_urls = request_body.get('urls') if isinstance(request_body, dict) else None
+    api.assert_not_none(user_urls, '用户链接不能为空')
+    browser = InstagramBrowser(api.get_idle_account())
+    # NOTE(review): InstagramBrowser as shown in this commit defines no
+    # batch_users method, and the browser is not launched on this path
+    # (the /blog route goes through __invoke__) — confirm before relying on it.
+    result = browser.batch_users(user_urls)
+    return api.success(result)
+
+
+@search_opt.route('/blog', methods=["POST"])
+def search_blog():
+    """
+    Look up a single Instagram post.
+
+    Expects a JSON object body with a "url" field holding the post link.
+    :return: JSON success envelope wrapping the post data
+    """
+    # silent=True yields None for a missing or non-JSON body, so the caller
+    # receives the business error below instead of a failure on `.get`.
+    request_body = request.get_json(silent=True)
+    blog_url = request_body.get('url') if isinstance(request_body, dict) else None
+    api.assert_not_none(blog_url, '博文链接不能为空')
+    browser = InstagramBrowser(api.get_idle_account())
+    # __invoke__ takes the account lock and launches the browser before
+    # running search_blog.
+    result = browser.__invoke__(browser.search_blog, url=blog_url)
+    return api.success(result)
+
+
+@search_opt.route('/batch-notes', methods=["POST"])
+def batch_notes():
+    """
+    Look up several Instagram posts in one request.
+
+    Expects a JSON object body with a "urls" field holding the post links.
+    :return: JSON success envelope wrapping the lookup result
+    """
+    # silent=True yields None for a missing or non-JSON body, so the caller
+    # receives the business error below instead of a failure on `.get`.
+    request_body = request.get_json(silent=True)
+    note_urls = request_body.get('urls') if isinstance(request_body, dict) else None
+    api.assert_not_none(note_urls, '作品链接不能为空')
+    browser = InstagramBrowser(api.get_idle_account())
+    # NOTE(review): InstagramBrowser as shown in this commit defines no
+    # batch_notes method, and the browser is not launched on this path
+    # (the /blog route goes through __invoke__) — confirm before relying on it.
+    result = browser.batch_notes(note_urls)
+    return api.success(result)
+

+ 70 - 0
app.py

@@ -0,0 +1,70 @@
+import traceback
+
+from flask import Flask, request
+
+from api import *
+from api import login, search
+
+# Resolve paths relative to this file so the working directory does not matter.
+current_folder = os.path.dirname(os.path.abspath(__file__))
+logs_folder = os.path.join(current_folder, 'logs')
+os.makedirs(logs_folder, exist_ok=True)
+
+# File handler with a timestamped, location-tagged line format.
+formatter = logging.Formatter("%(asctime)s %(levelname)s[%(funcName)s:%(lineno)s]:%(message)s", "%Y-%m-%d %H:%M:%S")
+file_handler = logging.FileHandler(filename=f"{current_folder}/logs/app.log", encoding="utf-8")
+file_handler.setFormatter(formatter)
+
+# Console handler (default format); handy during development.
+console_handler = logging.StreamHandler()
+
+# Root logger: INFO and above go to both the log file and the console.
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+logger.addHandler(file_handler)
+logger.addHandler(console_handler)
+
+app = Flask(__name__)
+
+# Mount the API blueprints under their URL prefixes.
+app.register_blueprint(login.login_opt, url_prefix="/login")
+app.register_blueprint(search.search_opt, url_prefix="/search")
+
+
+@app.errorhandler(Exception)
+def handle_exception(error: Exception):
+    """
+    Global exception handler.
+
+    A BusinessException is an expected failure: its message is returned in a
+    failure envelope with HTTP 200. Anything else is logged with its traceback
+    and returned as a failure envelope with HTTP 500.
+    """
+    if isinstance(error, BusinessException):
+        return fail_response(error.msg), 200
+    # exc_info=error attaches the traceback to the log record so it reaches
+    # the log file as well; traceback.print_exc() only wrote to stderr.
+    logging.error("Unhandled exception: %s", error, exc_info=error)
+    return fail_response(str(error)), 500
+
+
+@app.before_request
+def log_request():
+    """
+    Log the method, URL and raw body of every incoming request.
+    """
+    logging.info('Request: %s %s', request.method, request.url)
+    body_text = request.get_data(as_text=True)
+    logging.info('Request Body: %s', body_text)
+
+
+@app.after_request
+def log_response(response):
+    """
+    Log the body of every outgoing response, truncated to 1000 characters.
+
+    :return: the response, unchanged
+    """
+    body_text = response.get_data(as_text=True)
+    # Long bodies are cut and suffixed with an ellipsis to keep the log readable.
+    shown = body_text[:1000] + "..." if len(body_text) > 1000 else body_text
+    logging.info('Response Body: %s', shown)
+    return response
+
+
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=8980, threaded=True)

+ 55 - 0
browser/__init__.py

@@ -0,0 +1,55 @@
+"""
+
+"""
+import platform
+from abc import abstractmethod
+
+import api
+from playwright.sync_api import Playwright, sync_playwright
+
+SPLIT_CHAR = '\\' if platform.system() == 'Windows' else '/'
+
+
+class BaseBrowser:
+    """
+    Base wrapper around a persistent Playwright Chromium context for one account.
+
+    Subclasses provide the platform name through __get_name__; it becomes part
+    of the profile directory path.
+    """
+
+    def __init__(self, account: str, playwright=None):
+        """
+        :param account: account identifier; must not be None
+        :param playwright: optional already-started Playwright object
+        """
+        api.assert_not_none(account, "账号不能为空")
+        self.account = account
+        self.playwright = playwright
+        # Browser context and its first page; filled in by __init_browser__.
+        self.browser = None
+        self.page = None
+        # Scratch slots for data gathered while a page is being processed.
+        self.result = None
+        self.list_result = []
+        self.map_result = {}
+        self.has_more = False
+
+    def __init_browser__(self, playwright):
+        """
+        Launch the persistent browser context and remember its first page.
+
+        :param playwright: Playwright object to use; when falsy a new sync
+            Playwright session is started
+        """
+        self.playwright = playwright or sync_playwright().start()
+        # One profile folder per platform and account.
+        profile_dir = f'.{SPLIT_CHAR}.data{SPLIT_CHAR}{self.__get_name__()}{SPLIT_CHAR}{self.account}'
+        launch_args = [
+            '--disable-blink-features=AutomationControlled',
+            # '--incognito',
+            '--ignore-certificate-errors-spki-list',
+            '--disable-web-security',  # turn off web security checks
+            '--no-sandbox',  # run without the sandbox
+            '--disable-dev-shm-usage',  # do not use /dev/shm
+            '--disable-features=site-per-process',  # turn off per-site process isolation
+            '--ignore-certificate-errors',  # ignore certificate errors
+            '--disable-features=AutomationControlled'  # turn off automation-related features
+        ]
+        self.browser = self.playwright.chromium.launch_persistent_context(
+            proxy=None,
+            user_data_dir=profile_dir,
+            headless=False,
+            slow_mo=1000,
+            channel="chrome",
+            ignore_https_errors=True,
+            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) '
+                       'Chrome/126.0.0.0 Safari/537.36',
+            args=launch_args)
+        # Inject the stealth script into every page before its own scripts run.
+        self.browser.add_init_script(path="./stealth.min.js")
+        self.page = self.browser.pages[0]
+
+    @abstractmethod
+    def __get_name__(self):
+        """Return the platform name used in the profile directory path."""
+        pass

+ 70 - 0
instagram/__init__.py

@@ -0,0 +1,70 @@
+"""
+
+"""
+import json
+import logging
+from playwright.sync_api import sync_playwright, Page, Playwright
+
+import api
+from browser import BaseBrowser
+from util.lock_util import LockManager
+from instagram.data_handler import *
+
+IG_URL = 'https://www.instagram.com/'
+
+lock_manager = LockManager()
+
+
+class InstagramBrowser(BaseBrowser):
+    """
+    Instagram-specific browser: loads a post page and collects the post and
+    author data from the responses the page triggers.
+    """
+
+    def __init__(self, account: str, playwright=None):
+        super().__init__(account, playwright)
+        # Shortcode of the post being looked up; set by search_blog.
+        self.id = None
+
+    def __get_name__(self):
+        return 'instagram'
+
+    def __invoke__(self, lambda_func, *args, **kwargs):
+        """
+        Run `lambda_func(*args, **kwargs)` with the account lock held and a
+        freshly launched browser; the lock is released afterwards in all cases.
+        """
+        lock_manager.acquire_lock(self.account)
+        try:
+            with sync_playwright() as playwright:
+                self.__init_browser__(playwright)
+                return lambda_func(*args, **kwargs)
+        finally:
+            lock_manager.release_lock(self.account)
+
+    @staticmethod
+    def _extract_shortcode(url):
+        """Return the last path segment of `url`, ignoring query, fragment and trailing slashes."""
+        path = url.split('#', 1)[0].split('?', 1)[0]
+        return path.rstrip('/').split('/')[-1]
+
+    def search_blog(self, url):
+        """
+        Open the post at `url` and return its data, or None when the post
+        payload was not seen.
+
+        :param url: post link, e.g. https://www.instagram.com/p/<shortcode>/
+        :raises BusinessException: when `url` is None or has no path segment
+        """
+        api.assert_not_none(url, 'url不能为空')
+        self.result = None
+        self.map_result = {}
+        # Taking split('/')[-1] of the raw URL yields the query string, or ''
+        # for a trailing slash, and '' is a substring of every response URL.
+        # Strip query/fragment/trailing slashes first to get the shortcode.
+        self.id = self._extract_shortcode(url)
+        api.assert_not_none(self.id or None, '无法从url中解析出博文ID')
+        self.browser.on('response', self.blog_info_handler)
+        self.page.goto(url)
+        self.page.wait_for_timeout(1000)
+        self.browser.on('response', self.user_info_handler)
+        if self.result is not None:
+            # Hover the author's avatar so the page requests follower counts etc.
+            username = self.result['user'].get('username')
+            head_ele = self.page.locator(f'img[alt="{username}的头像"]')
+            head_ele.nth(0).hover()
+            self.page.wait_for_timeout(1000)
+            if self.map_result.get('author') is not None:
+                self.result['user'].update(self.map_result['author'])
+        return self.result
+
+    def blog_info_handler(self, response):
+        """Response listener: parse the post data from a response whose URL contains the shortcode."""
+        if response is None or response.status != 200:
+            return
+        # Keep the first successful parse; later responses that merely carry
+        # the shortcode in their URL must not replace it.
+        if self.result is not None or not self.id or self.id not in response.url:
+            return
+        logging.info(f'get {self.id} blog response')
+        blog = get_blog_by_doc(response)
+        if blog is not None:
+            self.result = blog
+
+    def user_info_handler(self, response):
+        """Response listener: capture author data from a graphql query whose variables contain userID."""
+        if response is None or response.status != 200:
+            return
+        if '/graphql/query' not in response.url:
+            return
+        # post_data_json is None when the request has no body.
+        post_data = response.request.post_data_json
+        if not isinstance(post_data, dict):
+            return
+        variables = post_data.get('variables')
+        if isinstance(variables, str):
+            # Form-encoded bodies carry `variables` as a JSON string.
+            try:
+                variables = json.loads(variables)
+            except json.JSONDecodeError:
+                return
+        if isinstance(variables, dict) and 'userID' in variables:
+            self.map_result['author'] = get_user_by_request(response)

+ 82 - 0
instagram/data_handler.py

@@ -0,0 +1,82 @@
+"""
+
+"""
+import json
+import re
+import jsonpath
+from util import json_util
+
+blog_require_fields = ['code', 'pk', 'id', 'taken_at', 'image_versions2', 'user', 'media_type', 'carousel_media',
+                       'carousel_media_count',
+                       'like_count', 'comment_count', 'caption', 'caption_is_edited']
+
+user_require_fields = ['pk', 'id', 'username', 'full_name', 'profile_pic_url', 'latest_reel_media', 'follower_count',
+                       'following_count', 'media_count']
+
+
+def get_blog_by_doc(response):
+    """
+    Extract the post record from a page response.
+
+    :param response: object whose text() returns the page HTML
+    :return: dict limited to blog_require_fields plus a "cover" URL, with
+        "user" limited to user_require_fields; None when the page holds no
+        post data
+    """
+    item = get_blog_json2(response.text())
+    # The parser returns None when the page carries no post payload.
+    if not item:
+        return None
+    blog = {k: v for k, v in item.items() if k in blog_require_fields}
+    # The first image candidate is used as the cover; tolerate posts without one.
+    candidates = (blog.get('image_versions2') or {}).get('candidates') or []
+    blog['cover'] = candidates[0].get('url') if candidates else None
+    blog['image_versions2'] = None
+    author = blog.get('user') or {}
+    blog['user'] = {k: v for k, v in author.items() if k in user_require_fields}
+    return blog
+
+
+def get_blog_json(html_content):
+    """
+    Line-based extraction of the JSON that follows the
+    "xdt_api__v1__media__shortcode__web_info" marker.
+
+    After a line containing the marker, the following stripped lines are
+    appended to a buffer that starts with "{"; once the buffer contains
+    '"items": [' and ']', each further line triggers a parse attempt.
+    :return: the first buffer that parses as JSON, or None
+    """
+    marker = '"xdt_api__v1__media__shortcode__web_info"'
+    collected = None  # stays None until the marker line has been seen
+
+    for raw_line in html_content.splitlines():
+        text = raw_line.strip()
+
+        if marker in text:
+            # (Re)start the buffer; the marker line itself is not kept.
+            collected = '{'
+            continue
+        if collected is None:
+            continue
+
+        collected = collected + text
+        if '"items": [' not in collected or ']' not in collected:
+            continue
+        try:
+            return json.loads(collected)
+        except json.JSONDecodeError:
+            # Not complete yet; keep accumulating lines.
+            pass
+    return None
+
+
+def get_blog_json2(html_content):
+    """
+    Find the first post item embedded in a page's JSON script tags.
+
+    Scans every <script type="application/json"> payload that mentions
+    "xdt_api__v1__media__shortcode__web_info" and returns items[0] of that
+    object from the first payload that parses and contains it.
+    :param html_content: page HTML
+    :return: the item dict, or None when nothing matches
+    """
+    key = 'xdt_api__v1__media__shortcode__web_info'
+    quoted_key = f'"{key}"'
+    if not html_content or quoted_key not in html_content:
+        return None
+    # Compiled once per call and run over the whole document, so every script
+    # tag is a candidate (not only the first one on a matching line).
+    script_pattern = re.compile(r'<script\s+type="application/json"[^>]*>(.*?)</script>', re.DOTALL)
+    for payload in script_pattern.findall(html_content):
+        if quoted_key not in payload:
+            continue
+        try:
+            data = json.loads(payload)
+        except json.JSONDecodeError:
+            # Skip a malformed payload and try the next script tag.
+            continue
+        matches = jsonpath.jsonpath(data, f'$..{key}.items[0]')
+        # jsonpath.jsonpath returns False when the expression matches nothing.
+        if matches:
+            return matches[0]
+    return None
+
+
+def get_user_by_request(response):
+    """
+    Extract the user record from a JSON response.
+
+    :param response: object whose json() returns the decoded body
+    :return: dict limited to user_require_fields, or None when the status is
+        not "ok" or the body has no data.user object
+    """
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('status') != 'ok':
+        return None
+    # "data" or "user" may be absent or null; treat both as "no user".
+    user = (response_json.get('data') or {}).get('user')
+    if not isinstance(user, dict):
+        return None
+    return {k: v for k, v in user.items() if k in user_require_fields}

+ 2 - 0
requirements.txt

@@ -0,0 +1,2 @@
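+playwright==1.46.0
+jsonpath-ng==1.7.0
+# instagram/data_handler.py does `import jsonpath`, which the `jsonpath` package provides (jsonpath-ng installs `jsonpath_ng`).
+jsonpath
+# app.py and the api blueprints import flask.
+flask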

File diff suppressed because it is too large
+ 6 - 0
stealth.min.js


+ 95 - 0
util/json_util.py

@@ -0,0 +1,95 @@
+"""
+自定义的一些 json功能
+"""
+import json
+import re
+
+
+def underscore_to_camelcase(snake_str):
+    """
+    Convert a snake_case name to camelCase.
+
+    The first component is kept as-is; each later component gets its first
+    character upper-cased and the rest left untouched.
+    """
+    head, *tail = snake_str.split('_')
+    # str.title() would also capitalise letters that follow digits
+    # ("2x" -> "2X") and lower-case the remainder ("ID" -> "Id").
+    return head + ''.join(part[:1].upper() + part[1:] for part in tail)
+
+
+def camelcase_to_underscore(camel_str):
+    """
+    Convert a camelCase name to snake_case.
+
+    An underscore is inserted before every ASCII capital except one at the
+    very start, then the whole string is lower-cased.
+    """
+    pieces = []
+    for position, char in enumerate(camel_str):
+        if position and 'A' <= char <= 'Z':
+            pieces.append('_')
+        pieces.append(char)
+    return ''.join(pieces).lower()
+
+
+def convert_keys_to_camelcase(obj):
+    """
+    Return a copy of `obj` with every dict key converted to camelCase.
+
+    Dicts and lists are walked recursively; any other value is returned as-is.
+    """
+    if isinstance(obj, dict):
+        return {
+            underscore_to_camelcase(key): convert_keys_to_camelcase(value)
+            for key, value in obj.items()
+        }
+    if isinstance(obj, list):
+        return [convert_keys_to_camelcase(element) for element in obj]
+    return obj
+
+
+def convert_keys_to_underscore(obj):
+    """
+    Return a copy of `obj` with every dict key converted to snake_case.
+
+    Dicts and lists are walked recursively; any other value is returned as-is.
+    """
+    if isinstance(obj, dict):
+        return {
+            camelcase_to_underscore(key): convert_keys_to_underscore(value)
+            for key, value in obj.items()
+        }
+    if isinstance(obj, list):
+        return [convert_keys_to_underscore(element) for element in obj]
+    return obj
+
+def get_json_from_content(content: str, key: str):
+    """
+    Extract the JSON object that follows a quoted key in irregular text.
+
+    After a line containing `"key"`, the following stripped lines are appended
+    to a buffer that starts with "{"; once the buffer contains '"items": ['
+    and ']', each further line triggers a parse attempt.
+    :param content: text to scan line by line
+    :param key: key name to look for (matched with surrounding double quotes)
+    :return: the first buffer that parses as JSON, or None when nothing matches
+    """
+    marker = f'"{key}"'  # plain f-string placeholder; "${key}" kept a literal "$"
+    inside_items = False  # True once the marker line has been seen
+    items_buffer = ""  # JSON text accumulated so far
+
+    for line in content.splitlines():
+        line = line.strip()
+
+        if marker in line:
+            # (Re)start the buffer; the marker line itself is not kept.
+            items_buffer = '{'
+            inside_items = True
+            continue
+
+        if inside_items:
+            items_buffer += line
+
+            if '"items": [' in items_buffer and ']' in items_buffer:
+                try:
+                    return json.loads(items_buffer)
+                except json.JSONDecodeError:
+                    continue  # not complete yet; keep reading
+    # Reached only after every line was scanned (this return used to sit
+    # inside the loop and ended the scan after the first line).
+    return None
+
+
+def find_key(data, target_key):
+    """
+    Depth-first search of nested dicts/lists for `target_key`.
+
+    :param data: dict, list, or any other value (non-containers hold no keys)
+    :param target_key: dict key to look for
+    :return: the value of the first match in traversal order, even when that
+        value is falsy (0, "", False, empty container); None when the key is
+        absent
+    """
+    # A private sentinel separates "not found" from a found falsy value;
+    # testing the recursive result for truthiness skipped such matches.
+    missing = object()
+
+    def _search(node):
+        if isinstance(node, dict):
+            for key, value in node.items():
+                if key == target_key:
+                    return value
+                found = _search(value)
+                if found is not missing:
+                    return found
+        elif isinstance(node, list):
+            for element in node:
+                found = _search(element)
+                if found is not missing:
+                    return found
+        return missing
+
+    result = _search(data)
+    return None if result is missing else result

+ 35 - 0
util/lock_util.py

@@ -0,0 +1,35 @@
+"""
+
+"""
+import logging
+import threading
+
+
+class LockManager:
+    """
+    Registry of per-key locks, so that each account has at most one browser
+    open on its profile at a time.
+    """
+
+    def __init__(self):
+        self.locks = {}
+        # Serialises creation of per-key locks so two threads asking for the
+        # same new key cannot each install a different Lock object.
+        self._registry_guard = threading.Lock()
+
+    def _lock_for(self, key):
+        """Return the lock registered for `key`, creating it on first use."""
+        with self._registry_guard:
+            lock = self.locks.get(key)
+            if lock is None:
+                lock = self.locks[key] = threading.Lock()
+            return lock
+
+    def acquire_lock(self, key, timeout=300):
+        """
+        Block until the lock for `key` is acquired.
+
+        :param key: lock identifier (the account)
+        :param timeout: maximum number of seconds to wait
+        :raises TimeoutError: when the lock is not obtained in time; continuing
+            silently would let the caller run unprotected and later release a
+            lock that another thread holds
+        """
+        if not self._lock_for(key).acquire(timeout=timeout):
+            raise TimeoutError(f"{key} 获取锁超时")
+        logging.info(f"{key} 获取锁成功")
+
+    def release_lock(self, key):
+        """Release the lock for `key`; unknown keys are ignored."""
+        lock = self.locks.get(key)
+        if lock is None:
+            return
+        lock.release()
+        logging.info(f"{key} 释放锁成功")
+
+    def is_locked(self, key):
+        """
+        Return True when a lock exists for `key` and is currently held.
+        """
+        lock = self.locks.get(key)
+        return lock.locked() if lock is not None else False

Some files were not shown because too many files changed in this diff