Переглянути джерело

feat: 兼容分享的链接格式,筛选优化请求响应的处理

wuwenyi 4 днів тому
батько
коміт
d4c012fca4
2 змінених файлів з 48 додано та 3 видалено
  1. 6 3
      instagram/__init__.py
  2. 42 0
      util/url_util.py

+ 6 - 3
instagram/__init__.py

@@ -8,6 +8,7 @@ from playwright.sync_api import sync_playwright, Page, Playwright
 import api
 from browser import BaseBrowser
 from util.lock_util import LockManager
+from util.url_util import get_id_by_url
 from instagram.data_handler import *
 
 IG_URL = 'https://www.instagram.com/'
@@ -44,7 +45,8 @@ class InstagramBrowser(BaseBrowser):
         api.assert_not_none(url, 'url不能为空')
         self.result = None
         self.map_result = {}
-        self.id = get_post_id(url)
+        self.id = get_id_by_url(url)
+        api.assert_not_none(self.id, 'cannot get post id from url')
         self.browser.on('response', self.blog_info_handler)
         self.page.goto(url)
         self.page.wait_for_timeout(1000)
@@ -62,11 +64,12 @@ class InstagramBrowser(BaseBrowser):
     def blog_info_handler(self, response):
         if response is None or response.status != 200:
             return
-        if '/info' in response.url:
+        content_type = response.headers.get('content-type', '')
+        if '/info' in response.url and 'application/json' in content_type:
             info = get_blog_by_rsp(response)
             if info is not None:
                 self.result = info
-        elif self.id in response.url:
+        elif self.id in response.url and 'text/html' in content_type:
             logging.info(f'get {self.id} blog response')
             doc = get_blog_by_doc(response)
             if doc is not None:

+ 42 - 0
util/url_util.py

@@ -0,0 +1,42 @@
+import re
+from typing import Optional
+from urllib.parse import urlparse
+
+import requests
+
+
+def get_expanded_url(url: str, timeout: float = 10.0) -> str:
+    """
+    Expand a short link to its full URL by following redirects.
+
+    Args:
+        url: The (possibly shortened) URL to resolve.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        The final URL after redirects, or ``url`` unchanged when the
+        request fails.
+    """
+    try:
+        # Without a timeout ``requests`` may block indefinitely on an
+        # unresponsive host.
+        response = requests.head(url, allow_redirects=True, timeout=timeout)
+    except requests.RequestException:
+        # Best effort: fall back to the input when it cannot be resolved.
+        return url
+    return response.url
+
+
+def get_id_by_url(url: str) -> Optional[str]:
+    """
+    Extract the post ID from an Instagram URL.
+
+    Share links (``https://www.instagram.com/share/...``) are first expanded
+    via ``get_expanded_url``; the ID is then taken from the URL path.
+
+    Args:
+        url: The URL string to inspect.
+
+    Returns:
+        The last non-empty segment of the URL path, or ``None`` when the
+        path contains no segments at all.
+    """
+    # Share links look like https://www.instagram.com/share/_nFwInAGM and
+    # must be expanded before the path is inspected.
+    short_link_pattern = re.compile(r'http[s]?://www\.instagram\.com/share')
+    if short_link_pattern.search(url):
+        url = get_expanded_url(url)
+
+    # Drop empty segments so a trailing slash (".../p/<id>/") does not yield
+    # an empty ID; an empty string is a substring of every URL.
+    path_parts = [part for part in urlparse(url).path.split('/') if part]
+    return path_parts[-1] if path_parts else None
+