From d052d425afeb38098ef0aeab793ec76f4ff4d757 Mon Sep 17 00:00:00 2001
From: jsz14
Date: Wed, 30 Aug 2023 19:14:01 +0800
Subject: [PATCH 1/5] Change how the Google Scholar search assistant fetches abstracts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/谷歌检索小助手.py | 47 +++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index 46c10de..55cebab 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -5,9 +5,10 @@ from toolbox import update_ui
 def get_meta_information(url, chatbot, history):
     import requests
     import arxiv
-    import difflib
     from bs4 import BeautifulSoup
     from toolbox import get_conf
+    from urllib.parse import urlparse
+    import re
     proxies, = get_conf('proxies')
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
@@ -18,9 +19,34 @@ def get_meta_information(url, chatbot, history):
 
     # 解析网页内容
     soup = BeautifulSoup(response.text, "html.parser")
-    def string_similar(s1, s2):
-        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
-
+    def search_all_version(url):
+        response = requests.get(url, proxies=proxies, headers=headers)
+        soup = BeautifulSoup(response.text, "html.parser")
+        for result in soup.select(".gs_ri"):
+            try:
+                url = result.select_one(".gs_rt").a['href']
+            except:
+                continue
+            arxiv_id = extract_arxiv_id(url)
+            if not arxiv_id:
+                continue
+            search = arxiv.Search(
+                id_list = [arxiv_id],
+                max_results = 1,
+                sort_by = arxiv.SortCriterion.Relevance,
+            )
+            return search
+        return None
+
+    def extract_arxiv_id(url):
+        # 返回给定的url解析出的arxiv_id,如url未成功匹配返回None
+        pattern = r'arxiv.org/abs/([^/]+)'
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+        else:
+            return None
+
     profile = []
     # 获取所有文章的标题和作者
     for result in soup.select(".gs_ri"):
@@ -31,17 +57,14 @@ def get_meta_information(url, chatbot, history):
         except:
             citation = 'cited by 0'
         abstract = result.select_one(".gs_rs").text.strip() # 摘要在 .gs_rs 中的文本,需要清除首尾空格
-        search = arxiv.Search(
-            query = title,
-            max_results = 1,
-            sort_by = arxiv.SortCriterion.Relevance,
-        )
+        other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # 获取所有版本的链接
+        search = search_all_version('http://' + urlparse(url).netloc + other_versions)
         try:
-            paper = next(search.results())
-            if string_similar(title, paper.title) > 0.90: # same paper
+            if search:
+                paper = next(search.results())
                 abstract = paper.summary.replace('\n', ' ')
                 is_paper_in_arxiv = True
-            else: # different paper
+            else: # not found
                 abstract = abstract
                 is_paper_in_arxiv = False
             paper = next(search.results())
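The core of this first patch: instead of fuzzy-matching the Scholar title against an arxiv query, it follows the result's "all versions" page, pulls an arXiv ID out of any arxiv.org/abs/... link it finds, and looks the paper up by ID. A minimal standalone sketch of that ID-based lookup follows; the example URL is illustration only and not taken from the repository, and it assumes the arxiv package is installed:

import re
import arxiv

def extract_arxiv_id(url):
    # 'https://arxiv.org/abs/1706.03762' -> '1706.03762'; None when the link is not an arxiv abstract page
    match = re.search(r'arxiv.org/abs/([^/]+)', url)
    return match.group(1) if match else None

arxiv_id = extract_arxiv_id('https://arxiv.org/abs/1706.03762')
if arxiv_id:
    search = arxiv.Search(id_list=[arxiv_id], max_results=1)
    paper = next(search.results())   # exact hit by ID, so no title-similarity check is needed
    print(paper.title)
    print(paper.summary[:100])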
From 03164bcb6f7831c4d384d705cce0c37e8d860ab0 Mon Sep 17 00:00:00 2001
From: jsz14
Date: Sat, 2 Sep 2023 19:58:24 +0800
Subject: [PATCH 2/5] fix: handle results where the all-versions links cannot be retrieved
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/谷歌检索小助手.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index 55cebab..d8222c2 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -57,9 +57,9 @@ def get_meta_information(url, chatbot, history):
         except:
             citation = 'cited by 0'
         abstract = result.select_one(".gs_rs").text.strip() # 摘要在 .gs_rs 中的文本,需要清除首尾空格
-        other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # 获取所有版本的链接
-        search = search_all_version('http://' + urlparse(url).netloc + other_versions)
         try:
+            other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # 获取所有版本的链接
+            search = search_all_version('http://' + urlparse(url).netloc + other_versions)
             if search:
                 paper = next(search.results())
                 abstract = paper.summary.replace('\n', ' ')
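Patch 2 only moves the two new lines inside the existing try block: when a result has no "All N versions" link, other_versions comes back as None, the URL concatenation raises, and before this fix that exception escaped because it happened before the try began. A hypothetical minimal repro of the failure being guarded against (not code from the repository):

# 'tags' stands in for result.select_one('.gs_flb').select('.gs_nph') when a
# search result offers no "All N versions" link
tags = []
other_versions = next((t['href'] for t in tags if 'cluster' in t['href']), None)   # -> None

try:
    page_url = 'http://scholar.google.com' + other_versions   # TypeError when other_versions is None
except TypeError:
    page_url = None   # after patch 2 this lands in the surrounding except, and the paper is simply treated as not found on arxiv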
From d183e34461a30637d3027f94b0fc48710ed2dcf3 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Wed, 6 Sep 2023 11:42:29 +0800
Subject: [PATCH 3/5] Add a switch for all-version search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/谷歌检索小助手.py | 101 ++++++++++++++++++------------
 1 file changed, 61 insertions(+), 40 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index d8222c2..d392588 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -1,14 +1,18 @@
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
-from toolbox import CatchException, report_execption, write_results_to_file
-from toolbox import update_ui
+from toolbox import CatchException, report_execption, promote_file_to_downloadzone
+from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
+import logging
+import requests
+
+ENABLE_ALL_VERSION_SEARCH = False
 
 def get_meta_information(url, chatbot, history):
-    import requests
     import arxiv
+    import difflib
+    import re
     from bs4 import BeautifulSoup
     from toolbox import get_conf
     from urllib.parse import urlparse
-    import re
     proxies, = get_conf('proxies')
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
@@ -19,34 +23,39 @@ def get_meta_information(url, chatbot, history):
 
     # 解析网页内容
     soup = BeautifulSoup(response.text, "html.parser")
-    def search_all_version(url):
-        response = requests.get(url, proxies=proxies, headers=headers)
-        soup = BeautifulSoup(response.text, "html.parser")
-        for result in soup.select(".gs_ri"):
-            try:
-                url = result.select_one(".gs_rt").a['href']
-            except:
-                continue
-            arxiv_id = extract_arxiv_id(url)
-            if not arxiv_id:
-                continue
-            search = arxiv.Search(
-                id_list = [arxiv_id],
-                max_results = 1,
-                sort_by = arxiv.SortCriterion.Relevance,
-            )
-            return search
-        return None
-
-    def extract_arxiv_id(url):
-        # 返回给定的url解析出的arxiv_id,如url未成功匹配返回None
-        pattern = r'arxiv.org/abs/([^/]+)'
-        match = re.search(pattern, url)
-        if match:
-            return match.group(1)
-        else:
+    def string_similar(s1, s2):
+        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
+
+    if ENABLE_ALL_VERSION_SEARCH:
+        def search_all_version(url):
+            response = requests.get(url, proxies=proxies, headers=headers)
+            soup = BeautifulSoup(response.text, "html.parser")
+            for result in soup.select(".gs_ri"):
+                try:
+                    url = result.select_one(".gs_rt").a['href']
+                except:
+                    continue
+                arxiv_id = extract_arxiv_id(url)
+                if not arxiv_id:
+                    continue
+                search = arxiv.Search(
+                    id_list = [arxiv_id],
+                    max_results = 1,
+                    sort_by = arxiv.SortCriterion.Relevance,
+                )
+                paper = next(search.results())
+                return paper
             return None
 
+        def extract_arxiv_id(url):
+            # 返回给定的url解析出的arxiv_id,如url未成功匹配返回None
+            pattern = r'arxiv.org/abs/([^/]+)'
+            match = re.search(pattern, url)
+            if match:
+                return match.group(1)
+            else:
+                return None
+
     profile = []
     # 获取所有文章的标题和作者
     for result in soup.select(".gs_ri"):
@@ -57,23 +66,30 @@ def get_meta_information(url, chatbot, history):
         except:
             citation = 'cited by 0'
         abstract = result.select_one(".gs_rs").text.strip() # 摘要在 .gs_rs 中的文本,需要清除首尾空格
-        try:
+        if ENABLE_ALL_VERSION_SEARCH:
             other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # 获取所有版本的链接
-            search = search_all_version('http://' + urlparse(url).netloc + other_versions)
-            if search:
-                paper = next(search.results())
+            paper = search_all_version('http://' + urlparse(url).netloc + other_versions)
+        else:
+            search = arxiv.Search(
+                query = title,
+                max_results = 1,
+                sort_by = arxiv.SortCriterion.Relevance,
+            )
+            paper = next(search.results())
+        try:
+            if paper and string_similar(title, paper.title) > 0.90: # same paper
                 abstract = paper.summary.replace('\n', ' ')
                 is_paper_in_arxiv = True
-            else: # not found
+            else: # different paper
                 abstract = abstract
                 is_paper_in_arxiv = False
             paper = next(search.results())
         except:
             abstract = abstract
             is_paper_in_arxiv = False
-        print(title)
-        print(author)
-        print(citation)
+        logging.info('[title]:' + title)
+        logging.info('[author]:' + author)
+        logging.info('[citation]:' + citation)
         profile.append({
             'title':title,
             'author':author,
@@ -88,6 +104,7 @@ def get_meta_information(url, chatbot, history):
 
 @CatchException
 def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+    disable_auto_promotion(chatbot=chatbot)
     # 基本信息:功能、贡献者
     chatbot.append([
         "函数插件功能?",
@@ -109,6 +126,9 @@ def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
     # 清空历史,以免输入溢出
     history = []
     meta_paper_info_list = yield from get_meta_information(txt, chatbot, history)
+    if len(meta_paper_info_list) == 0:
+        yield from update_ui_lastest_msg(lastmsg='获取文献失败,可能触发了google反爬虫机制。',chatbot=chatbot, history=history, delay=0)
+        return
     batchsize = 5
     for batch in range(math.ceil(len(meta_paper_info_list)/batchsize)):
         if len(meta_paper_info_list[:batchsize]) > 0:
@@ -130,6 +150,7 @@ def 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst
         "已经全部完成,您可以试试让AI写一个Related Works,例如您可以继续输入Write a \"Related Works\" section about \"你搜索的研究领域\" for me."])
     msg = '正常'
     yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
-    res = write_results_to_file(history)
-    chatbot.append(("完成了吗?", res));
+    path = write_history_to_file(history)
+    promote_file_to_downloadzone(path, chatbot=chatbot)
+    chatbot.append(("完成了吗?", path));
     yield from update_ui(chatbot=chatbot, history=history, msg=msg) # 刷新界面
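Patch 3 gates the new behaviour behind ENABLE_ALL_VERSION_SEARCH (off by default here), restores the original title-based arxiv query as the fallback path, swaps print for logging, and bails out early with a UI message when Scholar returns nothing. The fallback decides "same paper or not" with a plain difflib ratio; a small illustration of that check, with titles chosen purely as examples:

import difflib

def string_similar(s1, s2):
    # quick_ratio() is a cheap, order-insensitive upper bound on string similarity
    return difflib.SequenceMatcher(None, s1, s2).quick_ratio()

scholar_title = "Language Models are Few-Shot Learners"
print(string_similar(scholar_title, "Language Models are Few-Shot Learners"))                # 1.0 -> treated as the same paper
print(string_similar(scholar_title, "Language Models are Unsupervised Multitask Learners"))  # well below the 0.90 threshold -> treated as a different paper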
From 2c039ff5c9e49567e9629657ff717122a0fa829b Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Wed, 6 Sep 2023 22:19:32 +0800
Subject: [PATCH 4/5] add session

---
 crazy_functions/谷歌检索小助手.py | 40 ++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index d392588..4c8b71f 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -13,13 +13,21 @@ def get_meta_information(url, chatbot, history):
     from bs4 import BeautifulSoup
     from toolbox import get_conf
     from urllib.parse import urlparse
+    session = requests.session()
+
     proxies, = get_conf('proxies')
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
+        'Cache-Control':'max-age=0',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Connection': 'keep-alive'
     }
-    # 发送 GET 请求
-    response = requests.get(url, proxies=proxies, headers=headers)
+    session.proxies.update(proxies)
+    session.headers.update(headers)
+    response = session.get(url)
 
     # 解析网页内容
     soup = BeautifulSoup(response.text, "html.parser")
@@ -28,8 +36,9 @@ def get_meta_information(url, chatbot, history):
 
     if ENABLE_ALL_VERSION_SEARCH:
         def search_all_version(url):
-            response = requests.get(url, proxies=proxies, headers=headers)
+            response = session.get(url)
             soup = BeautifulSoup(response.text, "html.parser")
+
             for result in soup.select(".gs_ri"):
                 try:
                     url = result.select_one(".gs_rt").a['href']
@@ -39,14 +48,15 @@ def get_meta_information(url, chatbot, history):
                 if not arxiv_id:
                     continue
                 search = arxiv.Search(
-                    id_list = [arxiv_id],
-                    max_results = 1,
-                    sort_by = arxiv.SortCriterion.Relevance,
+                    id_list=[arxiv_id],
+                    max_results=1,
+                    sort_by=arxiv.SortCriterion.Relevance,
                 )
                 paper = next(search.results())
                 return paper
+
             return None
-
+
         def extract_arxiv_id(url):
             # 返回给定的url解析出的arxiv_id,如url未成功匹配返回None
             pattern = r'arxiv.org/abs/([^/]+)'
@@ -55,7 +65,7 @@ def get_meta_information(url, chatbot, history):
                 return match.group(1)
             else:
                 return None
-
+
     profile = []
     # 获取所有文章的标题和作者
     for result in soup.select(".gs_ri"):
@@ -87,15 +97,17 @@ def get_meta_information(url, chatbot, history):
         except:
             abstract = abstract
             is_paper_in_arxiv = False
+
         logging.info('[title]:' + title)
         logging.info('[author]:' + author)
         logging.info('[citation]:' + citation)
+
         profile.append({
-            'title':title,
-            'author':author,
-            'citation':citation,
-            'abstract':abstract,
-            'is_paper_in_arxiv':is_paper_in_arxiv,
+            'title': title,
+            'author': author,
+            'citation': citation,
+            'abstract': abstract,
+            'is_paper_in_arxiv': is_paper_in_arxiv,
         })
 
         chatbot[-1] = [chatbot[-1][0], title + f'\n\n是否在arxiv中(不在arxiv中无法获取完整摘要):{is_paper_in_arxiv}\n\n' + abstract]
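Patch 4 replaces the one-off requests.get calls with a single requests.Session that carries the proxy settings and a fuller set of browser-like headers, so every follow-up "all versions" fetch reuses the same connection, headers and cookies. A minimal sketch of that pattern; the query URL and the empty proxy dict are placeholders, not the project's configuration:

import requests

session = requests.session()
session.proxies.update({})   # e.g. {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'} when a proxy is configured
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
})

# every request made through the session now sends the same headers and goes through the same proxy
response = session.get('https://scholar.google.com/scholar?q=attention+is+all+you+need')
print(response.status_code)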
From e512d99879df882624cf04d57698f50f805c2cf7 Mon Sep 17 00:00:00 2001
From: binary-husky
Date: Sat, 9 Sep 2023 18:22:22 +0800
Subject: [PATCH 5/5] Add a short delay to avoid triggering the anti-crawler mechanism
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/谷歌检索小助手.py | 51 +++++++++++++++++++------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/crazy_functions/谷歌检索小助手.py b/crazy_functions/谷歌检索小助手.py
index 4c8b71f..05e80d2 100644
--- a/crazy_functions/谷歌检索小助手.py
+++ b/crazy_functions/谷歌检索小助手.py
@@ -3,8 +3,10 @@ from toolbox import CatchException, report_execption, promote_file_to_downloadzo
 from toolbox import update_ui, update_ui_lastest_msg, disable_auto_promotion, write_history_to_file
 import logging
 import requests
+import time
+import random
 
-ENABLE_ALL_VERSION_SEARCH = False
+ENABLE_ALL_VERSION_SEARCH = True
 
 def get_meta_information(url, chatbot, history):
     import arxiv
@@ -36,6 +38,7 @@ def get_meta_information(url, chatbot, history):
 
     if ENABLE_ALL_VERSION_SEARCH:
         def search_all_version(url):
+            time.sleep(random.randint(1,5)) # 睡一会防止触发google反爬虫
             response = session.get(url)
             soup = BeautifulSoup(response.text, "html.parser")
 
@@ -52,7 +55,8 @@ def get_meta_information(url, chatbot, history):
                     max_results=1,
                     sort_by=arxiv.SortCriterion.Relevance,
                 )
-                paper = next(search.results())
+                try: paper = next(search.results())
+                except: paper = None
                 return paper
 
             return None
@@ -76,25 +80,32 @@ def get_meta_information(url, chatbot, history):
         except:
             citation = 'cited by 0'
         abstract = result.select_one(".gs_rs").text.strip() # 摘要在 .gs_rs 中的文本,需要清除首尾空格
-        if ENABLE_ALL_VERSION_SEARCH:
-            other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # 获取所有版本的链接
-            paper = search_all_version('http://' + urlparse(url).netloc + other_versions)
+
+        # 首先在arxiv上搜索,获取文章摘要
+        search = arxiv.Search(
+            query = title,
+            max_results = 1,
+            sort_by = arxiv.SortCriterion.Relevance,
+        )
+        try: paper = next(search.results())
+        except: paper = None
+
+        is_match = paper is not None and string_similar(title, paper.title) > 0.90
+
+        # 如果在Arxiv上匹配失败,检索文章的历史版本的题目
+        if not is_match and ENABLE_ALL_VERSION_SEARCH:
+            other_versions_page_url = [tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']]
+            if len(other_versions_page_url) > 0:
+                other_versions_page_url = other_versions_page_url[0]
+                paper = search_all_version('http://' + urlparse(url).netloc + other_versions_page_url)
+                is_match = paper is not None and string_similar(title, paper.title) > 0.90
+
+        if is_match:
+            # same paper
+            abstract = paper.summary.replace('\n', ' ')
+            is_paper_in_arxiv = True
         else:
-            search = arxiv.Search(
-                query = title,
-                max_results = 1,
-                sort_by = arxiv.SortCriterion.Relevance,
-            )
-            paper = next(search.results())
-        try:
-            if paper and string_similar(title, paper.title) > 0.90: # same paper
-                abstract = paper.summary.replace('\n', ' ')
-                is_paper_in_arxiv = True
-            else: # different paper
-                abstract = abstract
-                is_paper_in_arxiv = False
-            paper = next(search.results())
-        except:
+            # different paper
             abstract = abstract
             is_paper_in_arxiv = False
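The final patch switches all-version search back on, sleeps a random 1-5 seconds before each extra Scholar page fetch, and guards next(search.results()) so an empty arxiv result no longer raises. The same two ideas in a compact standalone form; the arXiv ID and the function name are illustrative assumptions, not code from the repository:

import time
import random
import arxiv

def fetch_paper_by_id(arxiv_id):
    time.sleep(random.randint(1, 5))   # brief random pause so repeated scraping looks less bot-like
    search = arxiv.Search(id_list=[arxiv_id], max_results=1)
    try:
        return next(search.results())
    except Exception:                  # the patch uses a bare except; StopIteration covers the empty-result case
        return None

paper = fetch_paper_by_id('1706.03762')
print(paper.title if paper else 'not found on arxiv')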