From aba871342f984fa0f2f50ccf9178f0cf65b6117a Mon Sep 17 00:00:00 2001
From: Menghuan1918 <menghuan2003@outlook.com>
Date: Wed, 3 Jan 2024 19:49:17 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E5=89=B2?=
 =?UTF-8?q?=E5=87=BD=E6=95=B0=E4=B8=AD=E4=BD=BF=E7=94=A8=E7=9A=84=E5=8F=98?=
 =?UTF-8?q?=E9=87=8F=E9=94=99=E8=AF=AF=20(#1443)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix force_breakdown call and error message to use remain_txt_to_cut instead of txt_tocut

* Add handling for PDFs with lowercase starting paragraphs

* Capitalize the first block of meta_txt when it starts with a lowercase word
---
 crazy_functions/crazy_utils.py           | 3 +++
 crazy_functions/pdf_fns/breakdown_txt.py | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
index 4d3b195..e7e625b 100644
--- a/crazy_functions/crazy_utils.py
+++ b/crazy_functions/crazy_utils.py
@@ -466,6 +466,9 @@ def read_and_clean_pdf_text(fp):
                     return True
                 else:
                     return False
+            # 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写
+            if starts_with_lowercase_word(meta_txt[0]):
+                meta_txt[0] = meta_txt[0].capitalize()
             for _ in range(100):
                 for index, block_txt in enumerate(meta_txt):
                     if starts_with_lowercase_word(block_txt):
diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py
index a961481..e7c7673 100644
--- a/crazy_functions/pdf_fns/breakdown_txt.py
+++ b/crazy_functions/pdf_fns/breakdown_txt.py
@@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
                 # 如果没有找到合适的切分点
                 if break_anyway:
                     # 是否允许暴力切分
-                    prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
+                    prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
                 else:
                     # 不允许直接报错
-                    raise RuntimeError(f"存在一行极长的文本！{txt_tocut}")
+                    raise RuntimeError(f"存在一行极长的文本！{remain_txt_to_cut}")
 
             # 追加列表
             res.append(prev); fin_len+=len(prev)

From d883c7f34bcbb60b45767fd7eedeba2a703b7f13 Mon Sep 17 00:00:00 2001
From: fzcqc <140309989+fzcqc@users.noreply.github.com>
Date: Wed, 3 Jan 2024 19:57:10 +0800
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20expected=5Fwords=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E5=8F=8D=E6=96=9C=E6=9D=86=20(#1442)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crazy_functions/latex_fns/latex_toolbox.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py
index 0a6a873..964507c 100644
--- a/crazy_functions/latex_fns/latex_toolbox.py
+++ b/crazy_functions/latex_fns/latex_toolbox.py
@@ -250,8 +250,8 @@ def find_main_tex_file(file_manifest, mode):
     else: # if len(canidates) >= 2 通过一些Latex模板中常见（但通常不会出现在正文）的单词，对不同latex源文件扣分，取评分最高者返回
         canidates_score = []
         # 给出一些判定模板文档的词作为扣分项
-        unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
-        expected_words = ['\input', '\ref', '\cite']
+        unexpected_words = ['\\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
+        expected_words = ['\\input', '\\ref', '\\cite']
         for texf in canidates:
             canidates_score.append(0)
             with open(texf, 'r', encoding='utf8', errors='ignore') as f: