修复分割函数中使用的变量错误 (#1443)
* Fix force_breakdown function parameter name
* Add handling for PDFs with lowercase starting paragraphs
* Change first lowercase word in meta_txt to uppercase
This commit is contained in:
parent
a96f842b3a
commit
aba871342f
@@ -466,6 +466,9 @@ def read_and_clean_pdf_text(fp):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
# 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写
|
||||
if starts_with_lowercase_word(meta_txt[0]):
|
||||
meta_txt[0] = meta_txt[0].capitalize()
|
||||
for _ in range(100):
|
||||
for index, block_txt in enumerate(meta_txt):
|
||||
if starts_with_lowercase_word(block_txt):
|
||||
|
@@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
|
||||
# 如果没有找到合适的切分点
|
||||
if break_anyway:
|
||||
# 是否允许暴力切分
|
||||
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
|
||||
prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
|
||||
else:
|
||||
# 不允许直接报错
|
||||
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
|
||||
raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}")
|
||||
|
||||
# 追加列表
|
||||
res.append(prev); fin_len+=len(prev)
|
||||
|
Loading…
x
Reference in New Issue
Block a user