From 28d777a96bfb92f86267797c5e2e6bc6ba0b7ae7 Mon Sep 17 00:00:00 2001 From: binary-husky Date: Sun, 10 Sep 2023 16:52:35 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E6=8A=A5=E9=94=99=E6=B6=88?= =?UTF-8?q?=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crazy_functions/crazy_utils.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index 5a314b3..1567172 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -469,14 +469,16 @@ def read_and_clean_pdf_text(fp): '- ', '') for t in text_areas['blocks'] if 'lines' in t] ############################## <第 2 步,获取正文主字体> ################################## - fsize_statiscs = {} - for span in meta_span: - if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0 - fsize_statiscs[span[1]] += span[2] - main_fsize = max(fsize_statiscs, key=fsize_statiscs.get) - if REMOVE_FOOT_NOTE: - give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT - + try: + fsize_statiscs = {} + for span in meta_span: + if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0 + fsize_statiscs[span[1]] += span[2] + main_fsize = max(fsize_statiscs, key=fsize_statiscs.get) + if REMOVE_FOOT_NOTE: + give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT + except: + raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。') ############################## <第 3 步,切分和重新整合> ################################## mega_sec = [] sec = []