Merge branch 'master' into frontier
This commit is contained in:
commit
2abe665521
@ -466,6 +466,9 @@ def read_and_clean_pdf_text(fp):
|
|||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
# 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写
|
||||||
|
if starts_with_lowercase_word(meta_txt[0]):
|
||||||
|
meta_txt[0] = meta_txt[0].capitalize()
|
||||||
for _ in range(100):
|
for _ in range(100):
|
||||||
for index, block_txt in enumerate(meta_txt):
|
for index, block_txt in enumerate(meta_txt):
|
||||||
if starts_with_lowercase_word(block_txt):
|
if starts_with_lowercase_word(block_txt):
|
||||||
|
@ -250,8 +250,8 @@ def find_main_tex_file(file_manifest, mode):
|
|||||||
else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回
|
else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回
|
||||||
canidates_score = []
|
canidates_score = []
|
||||||
# 给出一些判定模板文档的词作为扣分项
|
# 给出一些判定模板文档的词作为扣分项
|
||||||
unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
|
unexpected_words = ['\\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
|
||||||
expected_words = ['\input', '\ref', '\cite']
|
expected_words = ['\\input', '\\ref', '\\cite']
|
||||||
for texf in canidates:
|
for texf in canidates:
|
||||||
canidates_score.append(0)
|
canidates_score.append(0)
|
||||||
with open(texf, 'r', encoding='utf8', errors='ignore') as f:
|
with open(texf, 'r', encoding='utf8', errors='ignore') as f:
|
||||||
|
@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
|
|||||||
# 如果没有找到合适的切分点
|
# 如果没有找到合适的切分点
|
||||||
if break_anyway:
|
if break_anyway:
|
||||||
# 是否允许暴力切分
|
# 是否允许暴力切分
|
||||||
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
|
prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
|
||||||
else:
|
else:
|
||||||
# 不允许直接报错
|
# 不允许直接报错
|
||||||
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
|
raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}")
|
||||||
|
|
||||||
# 追加列表
|
# 追加列表
|
||||||
res.append(prev); fin_len+=len(prev)
|
res.append(prev); fin_len+=len(prev)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user