Merge branch 'master' of https://github.com/jsz14897502/gpt_academic into jsz14897502-master
This commit is contained in:
commit
fb78569335
@ -5,9 +5,10 @@ from toolbox import update_ui
|
|||||||
def get_meta_information(url, chatbot, history):
|
def get_meta_information(url, chatbot, history):
|
||||||
import requests
|
import requests
|
||||||
import arxiv
|
import arxiv
|
||||||
import difflib
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from toolbox import get_conf
|
from toolbox import get_conf
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import re
|
||||||
proxies, = get_conf('proxies')
|
proxies, = get_conf('proxies')
|
||||||
headers = {
|
headers = {
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
|
||||||
@ -18,8 +19,33 @@ def get_meta_information(url, chatbot, history):
|
|||||||
# 解析网页内容
|
# 解析网页内容
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
def string_similar(s1, s2):
    """Return a quick similarity score for two strings.

    The score is ``difflib.SequenceMatcher.quick_ratio()``, a cheap upper
    bound on the true similarity ratio, in the range [0, 1].
    """
    import difflib

    return difflib.SequenceMatcher(None, s1, s2).quick_ratio()


def search_all_version(url):
    """Find the first arXiv-hosted entry on a results page and build a query for it.

    The page at ``url`` is fetched with the ``proxies`` and ``headers`` taken
    from the enclosing scope, and every ``.gs_ri`` result entry is scanned in
    order. (NOTE(review): the ``gs_*`` selectors look like Google Scholar's
    "all versions" listing; confirm against the caller.)

    Args:
        url: Address of the listing page to fetch.

    Returns:
        An ``arxiv.Search`` restricted to the first arXiv id found, or
        ``None`` when no entry links to an arXiv abstract page.
    """
    response = requests.get(url, proxies=proxies, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    for result in soup.select(".gs_ri"):
        try:
            # Use a separate name so the `url` argument is not overwritten.
            version_url = result.select_one(".gs_rt").a['href']
        except (AttributeError, TypeError, KeyError):
            # No title node, no <a> inside it, or an <a> without href:
            # this entry has no usable link, so move on to the next one.
            continue

        arxiv_id = extract_arxiv_id(version_url)
        if not arxiv_id:
            continue

        # The first entry that resolves to an arXiv id wins.
        return arxiv.Search(
            id_list=[arxiv_id],
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )

    return None
|
||||||
|
|
||||||
|
def extract_arxiv_id(url):
    """Extract the arXiv identifier from an ``arxiv.org/abs/...`` URL.

    Handles new-style ids (``2301.00001``, ``2301.00001v2``) and old-style
    ids that contain a slash (``hep-th/9901001``, ``math.GT/0309136v1``).
    Any query string or fragment after the id is ignored.

    Args:
        url: The URL to inspect. ``None`` or an empty string is accepted.

    Returns:
        The identifier as a string, or ``None`` if ``url`` is not an arXiv
        abstract link.
    """
    import re

    if not url:
        return None

    # First alternative: old-style "archive[.SUBJ]/YYMMNNN[vN]" ids, which
    # contain a slash. Second alternative: anything up to the next path
    # separator, query string, fragment, or whitespace.
    pattern = r'arxiv\.org/abs/([a-zA-Z.-]+/\d{7}(?:v\d+)?|[^/?#\s]+)'
    match = re.search(pattern, url)
    return match.group(1) if match else None
|
||||||
|
|
||||||
profile = []
|
profile = []
|
||||||
# 获取所有文章的标题和作者
|
# 获取所有文章的标题和作者
|
||||||
@ -31,17 +57,14 @@ def get_meta_information(url, chatbot, history):
|
|||||||
except:
|
except:
|
||||||
citation = 'cited by 0'
|
citation = 'cited by 0'
|
||||||
abstract = result.select_one(".gs_rs").text.strip() # 摘要在 .gs_rs 中的文本,需要清除首尾空格
|
abstract = result.select_one(".gs_rs").text.strip() # 摘要在 .gs_rs 中的文本,需要清除首尾空格
|
||||||
search = arxiv.Search(
|
|
||||||
query = title,
|
|
||||||
max_results = 1,
|
|
||||||
sort_by = arxiv.SortCriterion.Relevance,
|
|
||||||
)
|
|
||||||
try:
|
try:
|
||||||
|
other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None) # 获取所有版本的链接
|
||||||
|
search = search_all_version('http://' + urlparse(url).netloc + other_versions)
|
||||||
|
if search:
|
||||||
paper = next(search.results())
|
paper = next(search.results())
|
||||||
if string_similar(title, paper.title) > 0.90: # same paper
|
|
||||||
abstract = paper.summary.replace('\n', ' ')
|
abstract = paper.summary.replace('\n', ' ')
|
||||||
is_paper_in_arxiv = True
|
is_paper_in_arxiv = True
|
||||||
else: # different paper
|
else: # not found
|
||||||
abstract = abstract
|
abstract = abstract
|
||||||
is_paper_in_arxiv = False
|
is_paper_in_arxiv = False
|
||||||
paper = next(search.results())
|
paper = next(search.results())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user