Change the logic for fetching abstracts in the Google Scholar search assistant
This commit is contained in:
parent eb802ee975
commit d052d425af
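In short: the old code looked each paper up on arXiv with a fuzzy title query (arxiv.Search(query=title)) and accepted the hit only if difflib's similarity ratio against the Scholar title exceeded 0.90. The new code instead follows each result's "All versions" (cluster) link on Google Scholar, extracts an arxiv.org/abs/ id from one of the listed versions, and fetches the abstract by that exact id.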
@@ -5,9 +5,10 @@ from toolbox import update_ui
 def get_meta_information(url, chatbot, history):
     import requests
     import arxiv
-    import difflib
     from bs4 import BeautifulSoup
     from toolbox import get_conf
+    from urllib.parse import urlparse
+    import re
     proxies, = get_conf('proxies')
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
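The two new imports feed the rewritten lookup further down: re parses arXiv ids out of result links, and urlparse rebuilds an absolute URL from the relative "All versions" href that Scholar returns. A minimal sketch of the urlparse step, with made-up example values rather than anything taken from the commit:

    from urllib.parse import urlparse

    page_url = 'https://scholar.google.com/scholar?q=attention+is+all+you+need'  # hypothetical query URL
    relative_href = '/scholar?cluster=2960712678066186980'                       # hypothetical cluster href

    # Same string assembly as in the commit: 'http://' + host of the page
    # that was scraped + the relative href of the "All versions" link.
    all_versions_url = 'http://' + urlparse(page_url).netloc + relative_href
    print(all_versions_url)  # http://scholar.google.com/scholar?cluster=2960712678066186980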
@@ -18,8 +19,33 @@ def get_meta_information(url, chatbot, history):
     # Parse the page content
     soup = BeautifulSoup(response.text, "html.parser")
 
-    def string_similar(s1, s2):
-        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()
+    def search_all_version(url):
+        response = requests.get(url, proxies=proxies, headers=headers)
+        soup = BeautifulSoup(response.text, "html.parser")
+        for result in soup.select(".gs_ri"):
+            try:
+                url = result.select_one(".gs_rt").a['href']
+            except:
+                continue
+            arxiv_id = extract_arxiv_id(url)
+            if not arxiv_id:
+                continue
+            search = arxiv.Search(
+                id_list = [arxiv_id],
+                max_results = 1,
+                sort_by = arxiv.SortCriterion.Relevance,
+            )
+            return search
+        return None
+
+    def extract_arxiv_id(url):
+        # Return the arxiv_id parsed from the given url, or None if the url fails to match
+        pattern = r'arxiv.org/abs/([^/]+)'
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+        else:
+            return None
 
     profile = []
     # Get the title and authors of every article
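The extract_arxiv_id helper is a plain regex capture; a standalone sketch of how that pattern behaves on a few made-up URLs:

    import re

    def extract_arxiv_id(url):
        # Same pattern as in the commit: capture everything after 'arxiv.org/abs/'
        match = re.search(r'arxiv.org/abs/([^/]+)', url)
        return match.group(1) if match else None

    print(extract_arxiv_id('https://arxiv.org/abs/1706.03762'))      # '1706.03762'
    print(extract_arxiv_id('https://arxiv.org/abs/2303.08774v3'))    # '2303.08774v3' (version suffix is kept)
    print(extract_arxiv_id('https://arxiv.org/pdf/1706.03762.pdf'))  # None -- /pdf/ links do not match

Note that search_all_version returns on the first .gs_ri result that yields an id, so non-arXiv versions listed before it are simply skipped.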
@@ -31,17 +57,14 @@ def get_meta_information(url, chatbot, history):
         except:
             citation = 'cited by 0'
         abstract = result.select_one(".gs_rs").text.strip()  # the abstract is the text in .gs_rs; strip leading/trailing whitespace
-        search = arxiv.Search(
-            query = title,
-            max_results = 1,
-            sort_by = arxiv.SortCriterion.Relevance,
-        )
+        other_versions = next((tag['href'] for tag in result.select_one('.gs_flb').select('.gs_nph') if 'cluster' in tag['href']), None)  # link to all versions of the paper
+        search = search_all_version('http://' + urlparse(url).netloc + other_versions)
         try:
-            paper = next(search.results())
-            if string_similar(title, paper.title) > 0.90: # same paper
+            if search:
+                paper = next(search.results())
                 abstract = paper.summary.replace('\n', ' ')
                 is_paper_in_arxiv = True
-            else:   # different paper
+            else:   # not found
                 abstract = abstract
                 is_paper_in_arxiv = False
             paper = next(search.results())
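The net effect on the arxiv.Search call: the old fuzzy title query needed the 0.90 string_similar guard because the top relevance hit could be a different paper, whereas an id_list lookup either returns the exact paper or nothing. A minimal sketch of the new lookup, using an arbitrary example id:

    import arxiv

    # Exact lookup by id, as done inside search_all_version; no similarity check needed.
    search = arxiv.Search(id_list=['1706.03762'], max_results=1)
    paper = next(search.results())
    abstract = paper.summary.replace('\n', ' ')  # flatten the abstract to one line
    print(paper.title)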