From e512d99879df882624cf04d57698f50f805c2cf7 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Sat, 9 Sep 2023 18:22:22 +0800
Subject: [PATCH] Add a small delay to avoid triggering anti-crawler mechanisms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/谷歌检索小助手.py | 51 +++++++++++++++++++------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index 4c8b71f..05e80d2 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -3,8 +3,10 @@ from toolbox import CatchException, report_execption, promote_file_to_downloadzo
 from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
 import logging
 import requests
+import time
+import random
 
-ENABLE_ALL_VERSION_SEARCH = False
+ENABLE_ALL_VERSION_SEARCH = True
 
 def get_meta_information(url, chatbot, history):
     import arxiv
@@ -36,6 +38,7 @@ def get_meta_information(url, chatbot, history):
 
     if ENABLE_ALL_VERSION_SEARCH:
         def search_all_version(url):
+            time.sleep(random.randint(1,5)) # sleep briefly to avoid triggering Google's anti-crawler checks
             response = session.get(url)
             soup = BeautifulSoup(response.text, "html.parser")
 
@@ -52,7 +55,8 @@ def get_meta_information(url, chatbot, history):
                     max_results=1,
                     sort_by=arxiv.SortCriterion.Relevance,
                 )
-                paper = next(search.results())
+                try: paper = next(search.results())
+                except: paper = None
                 return paper
             return None
 
@@ -76,25 +80,32 @@ def get_meta_information(url, chatbot, history):
             except:
                 citation = 'cited by 0'
             abstract = result.select_one(".gs_rs").text.strip() # the abstract is the text inside .gs_rs; strip leading/trailing whitespace
-            if ENABLE_ALL_VERSION_SEARCH:
-                other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # get the link to the "all versions" page
-                paper = search_all_version('http://' + urlparse(url).netloc + other_versions)
+
+            # First, search on arxiv to fetch the paper's abstract
+            search = arxiv.Search(
+                query = title,
+                max_results = 1,
+                sort_by = arxiv.SortCriterion.Relevance,
+            )
+            try: paper = next(search.results())
+            except: paper = None
+
+            is_match = paper is not None and string_similar(title, paper.title) > 0.90
+
+            # If the arxiv match fails, look up the titles of the paper's earlier versions
+            if not is_match and ENABLE_ALL_VERSION_SEARCH:
+                other_versions_page_url = [tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']]
+                if len(other_versions_page_url) > 0:
+                    other_versions_page_url = other_versions_page_url[0]
+                    paper = search_all_version('http://' + urlparse(url).netloc + other_versions_page_url)
+                    is_match = paper is not None and string_similar(title, paper.title) > 0.90
+
+            if is_match:
+                # same paper
+                abstract = paper.summary.replace('\n', ' ')
+                is_paper_in_arxiv = True
             else:
-                search = arxiv.Search(
-                    query = title,
-                    max_results = 1,
-                    sort_by = arxiv.SortCriterion.Relevance,
-                )
-                paper = next(search.results())
-                try:
-                    if paper and string_similar(title, paper.title) > 0.90: # same paper
-                        abstract = paper.summary.replace('\n', ' ')
-                        is_paper_in_arxiv = True
-                    else: # different paper
-                        abstract = abstract
-                        is_paper_in_arxiv = False
-                        paper = next(search.results())
-                except:
+                # different paper
                 abstract = abstract
                 is_paper_in_arxiv = False
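
The control-flow change above is easiest to see outside the diff: query arxiv by title first, accept the hit only if the title similarity clears 0.90, and otherwise fall back to Google Scholar's "all versions" cluster page, pausing a random 1-5 seconds before each extra Scholar request. The sketch below mirrors that flow under stated assumptions: it uses the `arxiv` package (pip install arxiv) with the same Search/results calls as the patch, substitutes difflib.SequenceMatcher for the repo's string_similar helper, and the names match_on_arxiv and polite_get are illustrative, not part of the patch.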
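
    import time
    import random
    import difflib
    import arxiv
    import requests

    def string_similar(a, b):
        # Stand-in for the repo's string_similar helper: plain difflib ratio.
        return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def match_on_arxiv(title, threshold=0.90):
        # Query arxiv by title; guard next() so an empty result set
        # yields None instead of raising StopIteration, as in the patch.
        search = arxiv.Search(
            query=title,
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        try:
            paper = next(search.results())
        except StopIteration:
            return None
        return paper if string_similar(title, paper.title) > threshold else None

    def polite_get(session: requests.Session, url: str) -> requests.Response:
        # Randomized 1-5 s pause before each extra Scholar request,
        # mirroring the time.sleep(random.randint(1,5)) added above.
        time.sleep(random.randint(1, 5))
        return session.get(url)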
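
Two design points are worth noting. The patch uses a bare except around next(); the sketch narrows it to StopIteration, since an exhausted generator is the only new failure mode that next() introduces there. And the delay is randomized rather than fixed, so consecutive cluster-page fetches do not arrive at a machine-regular interval, which is the kind of pattern rate-limit heuristics tend to key on.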