Merge branch 'master' into frontier

2024-01-05 16:12:41 +08:00 · 2024-01-05 16:12:41 +08:00 · 2abe665521
commit 2abe665521
parent b0e6c4d365 d883c7f34b
3 changed files with 7 additions and 4 deletions
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@ -466,6 +466,9 @@ def read_and_clean_pdf_text(fp):
                    return True
                else:
                    return False
+            # 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写
+            if starts_with_lowercase_word(meta_txt[0]):
+                meta_txt[0] = meta_txt[0].capitalize()
            for _ in range(100):
                for index, block_txt in enumerate(meta_txt):
                    if starts_with_lowercase_word(block_txt):
--- a/crazy_functions/latex_fns/latex_toolbox.py
+++ b/crazy_functions/latex_fns/latex_toolbox.py
@ -250,8 +250,8 @@ def find_main_tex_file(file_manifest, mode):
    else: # if len(canidates) >= 2 通过一些Latex模板中常见（但通常不会出现在正文）的单词，对不同latex源文件扣分，取评分最高者返回
        canidates_score = []
        # 给出一些判定模板文档的词作为扣分项
-        unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
+        unexpected_words = ['\\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
-        expected_words = ['\input', '\ref', '\cite']
+        expected_words = ['\\input', '\\ref', '\\cite']
        for texf in canidates:
            canidates_score.append(0)
            with open(texf, 'r', encoding='utf8', errors='ignore') as f:
--- a/crazy_functions/pdf_fns/breakdown_txt.py
+++ b/crazy_functions/pdf_fns/breakdown_txt.py
@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
                # 如果没有找到合适的切分点
                if break_anyway:
                    # 是否允许暴力切分
-                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
+                    prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
                else:
                    # 不允许直接报错
-                    raise RuntimeError(f"存在一行极长的文本！{txt_tocut}")
+                    raise RuntimeError(f"存在一行极长的文本！{remain_txt_to_cut}")
            # 追加列表
            res.append(prev); fin_len+=len(prev)