Merge branch 'Euclid_Test' of https://github.com/Euclid-Jie/chatgpt_academic into Euclid-Jie-Euclid_Test
This commit is contained in:
		
						commit
						a71edeea95
					
				@ -11,6 +11,7 @@ def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, histor
 | 
			
		||||
            file_content = ""
 | 
			
		||||
            for page in doc:
 | 
			
		||||
                file_content += page.get_text()
 | 
			
		||||
                file_content = file_content.encode('gbk', 'ignore').decode('gbk')
 | 
			
		||||
            print(file_content)
 | 
			
		||||
 | 
			
		||||
        prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
 | 
			
		||||
 | 
			
		||||
@ -1,14 +1,19 @@
 | 
			
		||||
from predict import predict_no_ui
 | 
			
		||||
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
 | 
			
		||||
from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, readPdf
 | 
			
		||||
fast_debug = False
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
 | 
			
		||||
    import time, glob, os
 | 
			
		||||
    print('begin analysis on:', file_manifest)
 | 
			
		||||
    for index, fp in enumerate(file_manifest):
 | 
			
		||||
        with open(fp, 'r', encoding='utf-8') as f:
 | 
			
		||||
            file_content = f.read()
 | 
			
		||||
        if ".tex" in fp:
 | 
			
		||||
            with open(fp, 'r', encoding='utf-8') as f:
 | 
			
		||||
                file_content = f.read()
 | 
			
		||||
        if ".pdf" in fp.lower():
 | 
			
		||||
            file_content = readPdf(fp)
 | 
			
		||||
            file_content = BeautifulSoup(''.join(file_content), features="lxml").body.text.encode('gbk', 'ignore').decode('gbk')
 | 
			
		||||
 | 
			
		||||
        prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
 | 
			
		||||
        i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
 | 
			
		||||
@ -60,11 +65,12 @@ def 读文章写摘要(txt, top_p, temperature, chatbot, history, systemPromptTx
 | 
			
		||||
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
 | 
			
		||||
        yield chatbot, history, '正常'
 | 
			
		||||
        return
 | 
			
		||||
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] # + \
 | 
			
		||||
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
 | 
			
		||||
                    [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \
 | 
			
		||||
                    # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
 | 
			
		||||
                    # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
 | 
			
		||||
    if len(file_manifest) == 0:
 | 
			
		||||
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
 | 
			
		||||
        report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex或pdf文件: {txt}")
 | 
			
		||||
        yield chatbot, history, '正常'
 | 
			
		||||
        return
 | 
			
		||||
    yield from 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
 | 
			
		||||
 | 
			
		||||
@ -30,7 +30,7 @@ def get_crazy_functionals():
 | 
			
		||||
            "Color": "stop",    # 按钮颜色
 | 
			
		||||
            "Function": 解析一个C项目
 | 
			
		||||
        },
 | 
			
		||||
        "读tex论文写摘要": {
 | 
			
		||||
        "读tex or pdf论文写摘要": {
 | 
			
		||||
            "Color": "stop",    # 按钮颜色
 | 
			
		||||
            "Function": 读文章写摘要
 | 
			
		||||
        },
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,10 @@
 | 
			
		||||
gradio>=3.23
 | 
			
		||||
requests[socks]
 | 
			
		||||
mdtex2html
 | 
			
		||||
requests[socks]~=2.28.2
 | 
			
		||||
mdtex2html~=1.2.0
 | 
			
		||||
Markdown~=3.4.3
 | 
			
		||||
latex2mathml~=3.75.1
 | 
			
		||||
bs4~=0.0.1
 | 
			
		||||
lxml~=4.6.4
 | 
			
		||||
beautifulsoup4~=4.12.0
 | 
			
		||||
numpy~=1.24.2
 | 
			
		||||
pdfminer.six
 | 
			
		||||
							
								
								
									
										56
									
								
								toolbox.py
									
									
									
									
									
								
							
							
						
						
									
										56
									
								
								toolbox.py
									
									
									
									
									
								
							@ -1,6 +1,14 @@
 | 
			
		||||
import markdown, mdtex2html, threading, importlib, traceback
 | 
			
		||||
from show_math import convert as convert_math
 | 
			
		||||
from functools import wraps
 | 
			
		||||
import pdfminer
 | 
			
		||||
from pdfminer.pdfparser import PDFParser
 | 
			
		||||
from pdfminer.pdfdocument import PDFDocument
 | 
			
		||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 | 
			
		||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 | 
			
		||||
from pdfminer.pdfdevice import PDFDevice
 | 
			
		||||
from pdfminer.layout import LAParams
 | 
			
		||||
from pdfminer.converter import PDFPageAggregator
 | 
			
		||||
 | 
			
		||||
def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
 | 
			
		||||
    """
 | 
			
		||||
@ -236,3 +244,51 @@ def clear_line_break(txt):
 | 
			
		||||
    txt = txt.replace('  ', ' ')
 | 
			
		||||
    txt = txt.replace('  ', ' ')
 | 
			
		||||
    return txt
 | 
			
		||||
 | 
			
		||||
def readPdf(pdfPath):
    """
    Read a PDF file and return its text content.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        A list of strings, one per horizontal text box found by pdfminer's
        layout analysis, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # Layout-analysis parameters (values kept from the original implementation).
    laparams = LAParams(
        char_margin=10.0,
        line_margin=0.2,
        boxes_flow=0.2,
        all_texts=False,
    )

    text_boxes = []
    # The file must stay open while pages are processed: pdfminer reads page
    # data lazily from the underlying stream. The context manager guarantees
    # the handle is closed even when extraction is refused or parsing fails.
    with open(pdfPath, 'rb') as fp:
        # Parser bound to the file object, and the document structure built
        # from it (a password could be passed as a 2nd PDFDocument argument).
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        # Abort if the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resource manager, page aggregator device and interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            # Render the page into a layout object.
            interpreter.process_page(page)
            layout = device.get_result()
            # Iterate the layout container directly rather than reaching
            # into its private ``_objs`` attribute.
            for obj in layout:
                if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                    text_boxes.append(obj.get_text())

    return text_boxes
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user