Browse Source

feat: 兼容另外的ig文章链接格式,修复id获取问题

wuwenyi 1 month ago
parent
commit
eecdc61532
3 changed files with 43 additions and 8 deletions
  1. 6 1
      README.md
  2. 17 3
      instagram/__init__.py
  3. 20 4
      instagram/data_handler.py

+ 6 - 1
README.md

@@ -13,4 +13,9 @@ page = browser.page
 result = browser.__invoke__(lambda_func=browser.search_blog,
                             url='https://www.instagram.com/p/DC_xBDiTHrw/?utm_source=ig_web_copy_link&igsh=MzRlODBiNWFlZA%3D%3D&img_index=5')
 
-```
+```
+
+ig的几种链接格式:
+- https://www.instagram.com/{username}/p/{post_id}
+- https://www.instagram.com/p/{post_id}
+- https://www.instagram.com/suryxintagram/p/C_aJ_02yzaX/?img_index=10

+ 17 - 3
instagram/__init__.py

@@ -15,6 +15,13 @@ IG_URL = 'https://www.instagram.com/'
 lock_manager = LockManager()
 
 
+def get_post_id(url: str) -> 'str | None':
+    """Extract the post id from an Instagram post URL.
+
+    Both ``instagram.com/p/{post_id}`` and
+    ``instagram.com/{username}/p/{post_id}`` are recognised. Anything after
+    the id (trailing slash, query string, fragment) is ignored.
+
+    Args:
+        url: Link to an Instagram post.
+
+    Returns:
+        The post id, or ``None`` when ``url`` is empty or does not match
+        either post-link form.
+    """
+    if not url:
+        return None
+    # '#' is excluded alongside '/' and '?' so that a fragment placed directly
+    # after the id (".../p/ABC#comments") is not captured as part of it.
+    match = re.search(r'instagram\.com/(?:[^/]+/)?p/([^/?#]+)', url)
+    return match.group(1) if match else None
+
+
+
+
 class InstagramBrowser(BaseBrowser):
 
     def __init__(self, account: str, playwright=None):
@@ -37,7 +44,7 @@ class InstagramBrowser(BaseBrowser):
         api.assert_not_none(url, 'url不能为空')
         self.result = None
         self.map_result = {}
-        self.id = url.lstrip('/').split('/')[-1]
+        self.id = get_post_id(url)
         self.browser.on('response', self.blog_info_handler)
         self.page.goto(url)
         self.page.wait_for_timeout(1000)
@@ -55,9 +62,16 @@ class InstagramBrowser(BaseBrowser):
     def blog_info_handler(self, response):
         if response is None or response.status != 200:
             return
-        if self.id in response.url:
+        if '/info' in response.url:
+            info = get_blog_by_rsp(response)
+            if info is not None:
+                self.result = info
+        elif self.id in response.url:
             logging.info(f'get {self.id} blog response')
-            self.result = get_blog_by_doc(response)
+            doc = get_blog_by_doc(response)
+            if doc is not None:
+                self.result = doc
+
 
     def user_info_handler(self, response):
         if response is None or response.status != 200:

+ 20 - 4
instagram/data_handler.py

@@ -16,10 +16,9 @@ user_require_fields = ['pk', 'id', 'username', 'full_name', 'profile_pic_url', '
 
 def get_blog_by_doc(response):
     item = get_blog_json2(response.text())
-    item = {k: v for k, v in item.items() if k in blog_require_fields}
-    item['cover'] = item['image_versions2']['candidates'][0]['url']
-    item['image_versions2'] = None
-    item['user'] = {k: v for k, v in item['user'].items() if k in user_require_fields}
+    if not item:
+        return None
+    item = handle_item(item)
     return item
 
 
@@ -80,3 +79,20 @@ def get_user_by_request(response):
         return user
     else:
         return None
+
+
+def get_blog_by_rsp(response):
+    """Build the trimmed blog dict from a JSON response carrying ``items``.
+
+    Args:
+        response: Object exposing ``json()``; presumably a Playwright
+            response passed in by the browser's response handler (confirm
+            against the caller).
+
+    Returns:
+        The first entry of ``items`` after ``handle_item``, or ``None`` when
+        the body is not JSON, is not a JSON object, or has no usable first
+        item.
+    """
+    try:
+        response_json = response.json()
+    except ValueError:
+        # json.JSONDecodeError is a ValueError: the body was not JSON, so this
+        # response cannot be the one carrying the blog data.
+        return None
+    if not isinstance(response_json, dict):
+        return None
+    items = response_json.get('items')
+    # Check the container before indexing; a missing or empty ``items`` list
+    # must yield None rather than KeyError/IndexError.
+    if not isinstance(items, list) or not items or not items[0]:
+        return None
+    return handle_item(items[0])
+
+
+def handle_item(item):
+    """Reduce a raw blog item to the required fields and add a cover URL.
+
+    Keeps only the keys listed in ``blog_require_fields``, stores the URL of
+    the first ``image_versions2`` candidate under ``cover``, blanks
+    ``image_versions2`` and trims ``user`` to ``user_require_fields``.
+
+    Args:
+        item: Raw blog dict.
+
+    Returns:
+        A new dict; ``cover`` is ``None`` and ``user`` is ``{}`` when the
+        corresponding source data is absent.
+    """
+    item = {k: v for k, v in item.items() if k in blog_require_fields}
+    # Absent or empty image data gives cover=None instead of raising, so one
+    # incomplete payload does not abort the response handler.
+    candidates = (item.get('image_versions2') or {}).get('candidates') or []
+    item['cover'] = candidates[0].get('url') if candidates else None
+    item['image_versions2'] = None
+    user = item.get('user') or {}
+    item['user'] = {k: v for k, v in user.items() if k in user_require_fields}
+    return item