修复分割函数中使用的变量错误 (#1443)
* Fix the argument passed to force_breakdown (use remain_txt_to_cut instead of txt_tocut)
* Add handling for PDFs whose first paragraph starts with a lowercase word
* Capitalize the first paragraph in meta_txt when it starts with a lowercase word
This commit is contained in:
parent
a96f842b3a
commit
aba871342f
@ -466,6 +466,9 @@ def read_and_clean_pdf_text(fp):
|
|||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
# 对于某些PDF会有第一个段落就以小写字母开头,为了避免索引错误将其更改为大写
|
||||||
|
if starts_with_lowercase_word(meta_txt[0]):
|
||||||
|
meta_txt[0] = meta_txt[0].capitalize()
|
||||||
for _ in range(100):
|
for _ in range(100):
|
||||||
for index, block_txt in enumerate(meta_txt):
|
for index, block_txt in enumerate(meta_txt):
|
||||||
if starts_with_lowercase_word(block_txt):
|
if starts_with_lowercase_word(block_txt):
|
||||||
|
@ -65,10 +65,10 @@ def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=F
|
|||||||
# 如果没有找到合适的切分点
|
# 如果没有找到合适的切分点
|
||||||
if break_anyway:
|
if break_anyway:
|
||||||
# 是否允许暴力切分
|
# 是否允许暴力切分
|
||||||
prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
|
prev, post = force_breakdown(remain_txt_to_cut, limit, get_token_fn)
|
||||||
else:
|
else:
|
||||||
# 不允许直接报错
|
# 不允许直接报错
|
||||||
raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
|
raise RuntimeError(f"存在一行极长的文本!{remain_txt_to_cut}")
|
||||||
|
|
||||||
# 追加列表
|
# 追加列表
|
||||||
res.append(prev); fin_len+=len(prev)
|
res.append(prev); fin_len+=len(prev)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user