添加一个全版本搜索的开关

2023-09-06 11:42:29 +08:00 · 2023-09-06 11:42:29 +08:00 · d183e34461
commit d183e34461
parent fb78569335
1 changed files with 61 additions and 40 deletions
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@ -1,14 +1,18 @@
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
-from toolbox import CatchException, report_execption, write_results_to_file
-from toolbox import update_ui
+from toolbox import CatchException, report_execption, promote_file_to_downloadzone
+from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
+import logging
+import requests
+
+ENABLE_ALL_VERSION_SEARCH = False

 def get_meta_information(url, chatbot, history):
-    import requests
    import arxiv
+    import difflib
+    import re
    from bs4 import BeautifulSoup
    from toolbox import get_conf
    from urllib.parse import urlparse
-    import re
    proxies, = get_conf('proxies')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
@ -19,34 +23,39 @@ def get_meta_information(url, chatbot, history):
    # 解析网页内容
    soup = BeautifulSoup(response.text, "html.parser")

-    def search_all_version(url):
-        response = requests.get(url, proxies=proxies, headers=headers)
-        soup = BeautifulSoup(response.text, "html.parser")
-        for result in soup.select(".gs_ri"):
-            try:
-                url = result.select_one(".gs_rt").a['href']
-            except:
-                continue
-            arxiv_id = extract_arxiv_id(url)
-            if not arxiv_id:
-                continue
-            search = arxiv.Search(
-                id_list = [arxiv_id],
-                max_results = 1,
-                sort_by = arxiv.SortCriterion.Relevance,
-            )
-            return search
-        return None
-    
-    def extract_arxiv_id(url):
-        # 返回给定的url解析出的arxiv_id，如url未成功匹配返回None
-        pattern = r'arxiv.org/abs/([^/]+)'
-        match = re.search(pattern, url)
-        if match:
-            return match.group(1)
-        else:
+    def string_similar(s1, s2):
+        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
+
+    if ENABLE_ALL_VERSION_SEARCH:
+        def search_all_version(url):
+            response = requests.get(url, proxies=proxies, headers=headers)
+            soup = BeautifulSoup(response.text, "html.parser")
+            for result in soup.select(".gs_ri"):
+                try:
+                    url = result.select_one(".gs_rt").a['href']
+                except:
+                    continue
+                arxiv_id = extract_arxiv_id(url)
+                if not arxiv_id:
+                    continue
+                search = arxiv.Search(
+                    id_list = [arxiv_id],
+                    max_results = 1,
+                    sort_by = arxiv.SortCriterion.Relevance,
+                )
+                paper = next(search.results())
+                return paper
            return None
        
+        def extract_arxiv_id(url):
+            # 返回给定的url解析出的arxiv_id，如url未成功匹配返回None
+            pattern = r'arxiv.org/abs/([^/]+)'
+            match = re.search(pattern, url)
+            if match:
+                return match.group(1)
+            else:
+                return None
+        
    profile = []
    # 获取所有文章的标题和作者
    for result in soup.select(".gs_ri"):
@ -57,23 +66,30 @@ def get_meta_information(url, chatbot, history):
        except:
            citation = 'cited by 0'
        abstract = result.select_one(".gs_rs").text.strip()  # 摘要在 .gs_rs 中的文本，需要清除首尾空格
-        try:
+        if ENABLE_ALL_VERSION_SEARCH:
            other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None)  # 获取所有版本的链接
-            search = search_all_version('http://' + urlparse(url).netloc + other_versions)
-            if search:
-                paper = next(search.results())
+            paper = search_all_version('http://' + urlparse(url).netloc + other_versions)
+        else:
+            search = arxiv.Search(
+                query = title,
+                max_results = 1,
+                sort_by = arxiv.SortCriterion.Relevance,
+            )
+            paper = next(search.results())
+        try:
+            if paper and string_similar(title, paper.title) > 0.90: # same paper
                abstract = paper.summary.replace('\n', ' ')
                is_paper_in_arxiv = True
-            else:   # not found
+            else:   # different paper
                abstract = abstract
                is_paper_in_arxiv = False
            paper = next(search.results())
        except:
            abstract = abstract
            is_paper_in_arxiv = False
-        print(title)
-        print(author)
-        print(citation)
+        logging.info('[title]:' + title)
+        logging.info('[author]:' + author)
+        logging.info('[citation]:' + citation)
        profile.append({
            'title':title,
            'author':author,
@ -88,6 +104,7 @@ def get_meta_information(url, chatbot, history):

@CatchException
 def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    disable_auto_promotion(chatbot=chatbot)
    # 基本信息：功能、贡献者
    chatbot.append([
        "函数插件功能？",
@ -109,6 +126,9 @@ def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
    # 清空历史，以免输入溢出
    history = []
    meta_paper_info_list = yield from get_meta_information(txt, chatbot, history)
+    if len(meta_paper_info_list) == 0:
+        yield from update_ui_lastest_msg(lastmsg='获取文献失败，可能触发了google反爬虫机制。',chatbot=chatbot, history=history, delay=0)
+        return
    batchsize = 5
    for batch in range(math.ceil(len(meta_paper_info_list)/batchsize)):
        if len(meta_paper_info_list[:batchsize]) > 0:
@ -130,6 +150,7 @@ def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
        "已经全部完成，您可以试试让AI写一个Related Works，例如您可以继续输入Write a \"Related Works\" section about \"你搜索的研究领域\" for me."])
    msg = '正常'
    yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
-    res = write_results_to_file(history)
-    chatbot.append(("完成了吗？", res)); 
+    path = write_history_to_file(history)
+    promote_file_to_downloadzone(path, chatbot=chatbot)
+    chatbot.append(("完成了吗？", path)); 
    yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面