From e512d99879df882624cf04d57698f50f805c2cf7 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Sat, 9 Sep 2023 18:22:22 +0800
Subject: [PATCH] Add a small delay to avoid triggering anti-crawler mechanisms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/谷歌检索小助手.py | 51 +++++++++++++++++++------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index 4c8b71f..05e80d2 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -3,8 +3,10 @@ from toolbox import CatchException, report_execption, promote_file_to_downloadzo
 from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
 import logging
 import requests
+import time
+import random
 
-ENABLE_ALL_VERSION_SEARCH = False
+ENABLE_ALL_VERSION_SEARCH = True
 
 def get_meta_information(url, chatbot, history):
     import arxiv
@@ -36,6 +38,7 @@ def get_meta_information(url, chatbot, history):
 
     if ENABLE_ALL_VERSION_SEARCH:
         def search_all_version(url):
+            time.sleep(random.randint(1,5)) # sleep briefly to avoid triggering Google's anti-crawler checks
             response = session.get(url)
             soup = BeautifulSoup(response.text, "html.parser")
 
@@ -52,7 +55,8 @@ def get_meta_information(url, chatbot, history):
                     max_results=1,
                     sort_by=arxiv.SortCriterion.Relevance,
                 )
-                paper = next(search.results())
+                try: paper = next(search.results())
+                except: paper = None
                 return paper
             return None
 
@@ -76,25 +80,32 @@ def get_meta_information(url, chatbot, history):
             except:
                 citation = 'cited by 0'
             abstract = result.select_one(".gs_rs").text.strip() # the abstract is the text inside .gs_rs; strip leading/trailing whitespace
-            if ENABLE_ALL_VERSION_SEARCH:
-                other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # get the link to the "all versions" page
-                paper = search_all_version('http://' + urlparse(url).netloc + other_versions)
+
+            # First, search on arxiv to fetch the paper's abstract
+            search = arxiv.Search(
+                query = title,
+                max_results = 1,
+                sort_by = arxiv.SortCriterion.Relevance,
+            )
+            try: paper = next(search.results())
+            except: paper = None
+
+            is_match = paper is not None and string_similar(title, paper.title) > 0.90
+
+            # If the arxiv match fails, look up the titles of the paper's earlier versions
+            if not is_match and ENABLE_ALL_VERSION_SEARCH:
+                other_versions_page_url = [tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']]
+                if len(other_versions_page_url) > 0:
+                    other_versions_page_url = other_versions_page_url[0]
+                    paper = search_all_version('http://' + urlparse(url).netloc + other_versions_page_url)
+                    is_match = paper is not None and string_similar(title, paper.title) > 0.90
+
+            if is_match:
+                # same paper
+                abstract = paper.summary.replace('\n', ' ')
+                is_paper_in_arxiv = True
             else:
-                search = arxiv.Search(
-                    query = title,
-                    max_results = 1,
-                    sort_by = arxiv.SortCriterion.Relevance,
-                )
-                paper = next(search.results())
-                try:
-                    if paper and string_similar(title, paper.title) > 0.90: # same paper
-                        abstract = paper.summary.replace('\n', ' ')
-                        is_paper_in_arxiv = True
-                    else: # different paper
-                        abstract = abstract
-                        is_paper_in_arxiv = False
-                        paper = next(search.results())
-                except:
+                # different paper
                 abstract = abstract
                 is_paper_in_arxiv = False
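
The control-flow change above is easiest to see outside the diff: query arxiv by title first, accept the hit only if the title similarity clears 0.90, and otherwise fall back to Google Scholar's "all versions" cluster page, pausing a random 1-5 seconds before each extra Scholar request. The sketch below mirrors that flow under stated assumptions: it uses the `arxiv` package (pip install arxiv) with the same Search/results calls as the patch, substitutes difflib.SequenceMatcher for the repo's string_similar helper, and the names match_on_arxiv and polite_get are illustrative, not part of the patch.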
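
    import time
    import random
    import difflib
    import arxiv
    import requests

    def string_similar(a, b):
        # Stand-in for the repo's string_similar helper: plain difflib ratio.
        return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def match_on_arxiv(title, threshold=0.90):
        # Query arxiv by title; guard next() so an empty result set
        # yields None instead of raising StopIteration, as in the patch.
        search = arxiv.Search(
            query=title,
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        try:
            paper = next(search.results())
        except StopIteration:
            return None
        return paper if string_similar(title, paper.title) > threshold else None

    def polite_get(session: requests.Session, url: str) -> requests.Response:
        # Randomized 1-5 s pause before each extra Scholar request,
        # mirroring the time.sleep(random.randint(1,5)) added above.
        time.sleep(random.randint(1, 5))
        return session.get(url)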
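
Two design points are worth noting. The patch uses a bare except around next(); the sketch narrows it to StopIteration, since an exhausted generator is the only new failure mode that next() introduces there. And the delay is randomized rather than fixed, so consecutive cluster-page fetches do not arrive at a machine-regular interval, which is the kind of pattern rate-limit heuristics tend to key on.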