diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 4d3b195..e7e625b 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -466,6 +466,9 @@ def read_and_clean_pdf_text(fp): return True else: return False + # 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写 + if starts_with_lowercase_word(meta_txt[0]): + meta_txt[0] = meta_txt[0].capitalize() for _ in range(100): for index, block_txt in enumerate(meta_txt): if starts_with_lowercase_word(block_txt): diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py index 0a6a873..964507c 100644 --- a/crazy_functions/latex_fns/latex_toolbox.py +++ b/crazy_functions/latex_fns/latex_toolbox.py @@ -250,8 +250,8 @@ def find_main_tex_file(file_manifest, mode): else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 canidates_score = [] # 给出一些判定模板文档的词作为扣分项 - unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] - expected_words = ['\input', '\ref', '\cite'] + unexpected_words = ['\\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] + expected_words = ['\\input', '\\ref', '\\cite'] for texf in canidates: canidates_score.append(0) with open(texf, 'r', encoding='utf8', errors='ignore') as f: diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py index a961481..e7c7673 100644 --- a/crazy_functions/pdf_fns/breakdown_txt.py +++ b/crazy_functions/pdf_fns/breakdown_txt.py @@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F # 如果没有找到合适的切分点 if break_anyway: # 是否允许暴力切分 - prev, post = force_breakdown(txt_tocut, limit, get_token_fn) + prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn) else: # 不允许直接报错 - raise RuntimeError(f"存在一行极长的文本!{txt_tocut}") + raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}") # 追加列表 res.append(prev); fin_len+=len(prev)