# chatgpt_academic/crazy_functions/下载arxiv论文翻译摘要.py
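"""Crazy-function plugin: download an arXiv paper given a URL or paper ID,
scrape its metadata from the abstract page, and ask the model to extract
and translate the abstract into Chinese.
"""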
from toolbox import update_ui
from request_llm.bridge_chatgpt import predict_no_ui
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, get_conf
import re, requests, unicodedata, os


def download_arxiv_(url_pdf):
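    """Download an arXiv paper given an abs/pdf URL or a bare paper ID.

    Returns (local_pdf_path, other_info) on success, where other_info holds
    the year, abstract, authors and Comments field scraped from the abstract
    page, or None if the input cannot be recognized.
    """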
    if 'arxiv.org' not in url_pdf:
        if ('.' in url_pdf) and ('/' not in url_pdf):
            # Looks like a bare paper ID such as '1712.00559' -> build the abs URL
            new_url = 'https://arxiv.org/abs/' + url_pdf
            print('Download ID:', url_pdf, 'auto-located at:', new_url)
            return download_arxiv_(new_url)
        else:
            print('Unrecognized URL')
            return None
    if 'abs' in url_pdf:
        url_pdf = url_pdf.replace('abs', 'pdf')
        url_pdf = url_pdf + '.pdf'

    url_abs = url_pdf.replace('.pdf', '').replace('pdf', 'abs')
    title, other_info = get_name(_url_=url_abs)

    paper_id = title.split()[0]  # e.g. '[1712.00559]'
    if '2' in other_info['year']:
        title = other_info['year'] + ' ' + title

    # Prefix the title with a well-known venue if the Comments field mentions one
    known_conf = ['NeurIPS', 'NIPS', 'Nature', 'Science', 'ICLR', 'AAAI']
    for k in known_conf:
        if k in other_info['comment']:
            title = k + ' ' + title

    download_dir = './gpt_log/arxiv/'
    os.makedirs(download_dir, exist_ok=True)

    # Strip characters that are illegal or awkward in file names, collapse double spaces
    title_str = title.replace('?', '')\
        .replace(':', '')\
        .replace('\"', '')\
        .replace('\n', '')\
        .replace('  ', ' ')\
        .replace('  ', ' ')

    requests_pdf_url = url_pdf
    file_path = download_dir + title_str
    # if os.path.exists(file_path):
    #     print('Returning cached file')
    #     return file_path

    print('Downloading...')
    proxies, = get_conf('proxies')
    r = requests.get(requests_pdf_url, proxies=proxies)
    with open(file_path, 'wb+') as f:
        f.write(r.content)
    print('Download complete.')
    # print('Download command:', 'aria2c -o \"%s\" %s' % (title_str, url_pdf))
    # subprocess.call('aria2c --all-proxy=\"172.18.116.150:11084\" -o \"%s\" %s' % (download_dir+title_str, url_pdf), shell=True)

    # Candidate .bib file name (currently unused)
    x = "%s %s %s.bib" % (paper_id, other_info['year'], other_info['authors'])
    x = x.replace('?', '')\
        .replace(':', '')\
        .replace('\"', '')\
        .replace('\n', '')\
        .replace('  ', ' ')\
        .replace('  ', ' ')
    return file_path, other_info


def get_name(_url_):
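    """Scrape an arXiv abstract page for the paper title and metadata.

    Returns (title + '.pdf', other_details), where other_details carries the
    year, abstract, authors and the Comments field when available; missing
    fields fall back to empty strings.
    """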
    import os
    from bs4 import BeautifulSoup
    print('Fetching paper title!')
    print(_url_)

    # arxiv_recall = {}
    # if os.path.exists('./arxiv_recall.pkl'):
    #     with open('./arxiv_recall.pkl', 'rb') as f:
    #         arxiv_recall = pickle.load(f)
    # if _url_ in arxiv_recall:
    #     print('Found in cache')
    #     return arxiv_recall[_url_]

    proxies, = get_conf('proxies')
    res = requests.get(_url_, proxies=proxies)
    bs = BeautifulSoup(res.text, 'html.parser')
    other_details = {}

    # Get year (and abstract)
    try:
        year = bs.find_all(class_='dateline')[0].text
        year = re.search(r'(\d{4})', year, re.M | re.I).group(1)
        other_details['year'] = year
        abstract = bs.find_all(class_='abstract mathjax')[0].text
        other_details['abstract'] = abstract
    except Exception:
        other_details['year'] = ''
        print('Failed to get year')

    # Get authors
    try:
        authors = bs.find_all(class_='authors')[0].text
        authors = authors.split('Authors:')[1]
        other_details['authors'] = authors
    except Exception:
        other_details['authors'] = ''
        print('Failed to get authors')

    # Get the Comments field (venue, page count, etc.); fields in the
    # metatable are separated by wide runs of spaces
    try:
        comment = bs.find_all(class_='metatable')[0].text
        real_comment = None
        for item in comment.replace('\n', ' ').split('   '):
            if 'Comments' in item:
                real_comment = item
        if real_comment is not None:
            other_details['comment'] = real_comment
        else:
            other_details['comment'] = ''
    except Exception:
        other_details['comment'] = ''
        print('Failed to get comment')

    title_str = BeautifulSoup(
        res.text, 'html.parser').find('title').contents[0]
    print('Fetched:', title_str)
    # arxiv_recall[_url_] = (title_str+'.pdf', other_details)
    # with open('./arxiv_recall.pkl', 'wb') as f:
    #     pickle.dump(arxiv_recall, f)
    return title_str + '.pdf', other_details


@CatchException
def 下载arxiv论文并翻译摘要(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
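    """Plugin entry point: download the arXiv paper referenced in `txt`,
    then ask the model to extract the abstract and translate it into
    Chinese. Yields UI updates as a generator, like the other
    crazy_functions plugins.
    """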
    CRAZY_FUNCTION_INFO = "下载arxiv论文并翻译摘要函数插件作者[binary-husky]。正在提取摘要并下载PDF文档……"
    import glob
    import os

    # Basic info: what the plugin does and who contributed it
    chatbot.append(["函数插件功能?", CRAZY_FUNCTION_INFO])
    yield from update_ui(chatbot=chatbot, history=history)

    # Try importing the extra dependencies; if any is missing, suggest how to install it
    try:
        import pdfminer, bs4
    except Exception:
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}",
                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pdfminer beautifulsoup4```。")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Clear the history to avoid overflowing the model input
    history = []

    # Download the PDF and extract its metadata
    try:
        pdf_path, info = download_arxiv_(txt)
    except Exception:
        report_execption(chatbot, history,
                         a=f"解析项目: {txt}",
                         b=f"下载pdf文件未成功")
        yield from update_ui(chatbot=chatbot, history=history)
        return

    # Translate the abstract, etc.
    i_say = f"请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。材料如下:{str(info)}"
    i_say_show_user = f'请你阅读以下学术论文相关的材料,提取摘要,翻译为中文。论文:{pdf_path}'
    chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
    yield from update_ui(chatbot=chatbot, history=history)

    msg = '正常'
    # ** gpt request ** (with a timeout countdown)
    gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[])
    chatbot[-1] = (i_say_show_user, gpt_say)
    history.append(i_say_show_user); history.append(gpt_say)
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)

    # Write the results to a file
    import shutil
    # Move the PDF under ./gpt_log/ (the copy also resets the file's creation time)
    shutil.copyfile(pdf_path, f'./gpt_log/{os.path.basename(pdf_path)}'); os.remove(pdf_path)
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res + "\n\nPDF文件也已经下载"))
    yield from update_ui(chatbot=chatbot, history=history, msg=msg)
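

# Minimal standalone usage sketch (an illustration added here, not part of the
# plugin interface): it assumes the script is run from the repo root so that
# toolbox.get_conf can find a valid `proxies` setting. It exercises
# download_arxiv_ directly, bypassing the chat UI.
if __name__ == '__main__':
    pdf_path, info = download_arxiv_('1712.00559')  # a bare arXiv ID also works
    print('Saved to:', pdf_path)
    print('Year:', info['year'])
    print('Authors:', info['authors'])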