diff --git a/crazy_functions/latex_utils.py b/crazy_functions/latex_utils.py
index 5a93618..d759377 100644
--- a/crazy_functions/latex_utils.py
+++ b/crazy_functions/latex_utils.py
@@ -200,7 +200,7 @@ class LatexPaperSplit():
for res in pattern_compile.finditer(target_string):
cmd = res.group(1) # begin{what}
this = res.group(2) # content between begin and end
- white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof']
+ white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', 'em', 'emph', 'textit', 'textbf']
if cmd in white_list or this.count('\n') > 25:
sub_res = search_with_line_limit(this)
if not sub_res: continue
@@ -239,25 +239,12 @@ class LatexPaperSplit():
if lt is None: break
-
# root 是链表的头
print('正在分解Latex源文件,构建链表结构')
+ # 吸收在25行以内的begin-end组合
split_worker_begin_end(root, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=25)
- # 将分解结果返回 res_to_t
- with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
- res_to_t = []
- node = root
- while True:
- show_html = node.string.replace('\n','
')
- if not node.preserve:
- res_to_t.append(node.string)
- f.write(f'
{show_html}
') - else: - f.write(f'{show_html}
') - node = node.next - if node is None: break - + # 吸收其他杂项 split_worker(root, r"(.*?)\\maketitle", re.DOTALL) split_worker(root, r"\\section\{(.*?)\}") split_worker(root, r"\\section\*\{(.*?)\}") @@ -284,7 +271,10 @@ class LatexPaperSplit(): split_worker(root, r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}", re.DOTALL) split_worker(root, r"\$\$(.*?)\$\$", re.DOTALL) split_worker(root, r"\\item ") + split_worker(root, r"\\label\{(.*?)\}") split_worker(root, r"\\begin\{(.*?)\}") + split_worker(root, r"\\vspace\{(.*?)\}") + split_worker(root, r"\\hspace\{(.*?)\}") split_worker(root, r"\\end\{(.*?)\}") node = root @@ -335,12 +325,29 @@ class LatexPaperSplit(): node = root while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True if len(node.string.strip('\n').strip(''))<50: node.preserve = True node = node.next if node is None: break + # 将前后断行符脱离 + node = root + prev_node = None + while True: + if not node.preserve: + lstriped_ = node.string.lstrip().lstrip('\n') + if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): + prev_node.string += node.string[:-len(lstriped_)] + node.string = lstriped_ + rstriped_ = node.string.rstrip().rstrip('\n') + if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): + node.next.string = node.string[len(rstriped_):] + node.next.string + node.string = rstriped_ + # ===== + prev_node = node + node = node.next + if node is None: break + # 将分解结果返回 res_to_t with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: res_to_t = []