From c53320182ac4f843527ef6077a9baad902378f48 Mon Sep 17 00:00:00 2001 From: binary-husky <505030475@qq.com> Date: Sat, 29 Apr 2023 01:51:11 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dnewbing=E5=BC=95=E7=94=A8?= =?UTF-8?q?=E6=A0=B7=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/test_markdown_format.py | 130 ++++++++++++++++++++++++++++++++++ request_llm/bridge_newbing.py | 4 +- toolbox.py | 8 ++- 3 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 docs/test_markdown_format.py diff --git a/docs/test_markdown_format.py b/docs/test_markdown_format.py new file mode 100644 index 0000000..896f6f1 --- /dev/null +++ b/docs/test_markdown_format.py @@ -0,0 +1,130 @@ +sample = """ +[1]: https://baike.baidu.com/item/%E8%B4%A8%E8%83%BD%E6%96%B9%E7%A8%8B/1884527 "质能方程(质能方程式)_百度百科" +[2]: https://www.zhihu.com/question/348249281 "如何理解质能方程 E=mc²? - 知乎" +[3]: https://zhuanlan.zhihu.com/p/32597385 "质能方程的推导与理解 - 知乎 - 知乎专栏" + +你好,这是必应。质能方程是描述质量与能量之间的当量关系的方程[^1^][1]。用tex格式,质能方程可以写成$$E=mc^2$$,其中$E$是能量,$m$是质量,$c$是光速[^2^][2] [^3^][3]。 +""" +import re + +def preprocess_newbing_out(s): + pattern = r'\^(\d+)\^' # 匹配^数字^ + pattern2 = r'\[(\d+)\]' # 匹配^数字^ + sub = lambda m: '\['+m.group(1)+'\]' # 将匹配到的数字作为替换值 + result = re.sub(pattern, sub, s) # 替换操作 + if '[1]' in result: + result += '


' + "
".join([re.sub(pattern2, sub, r) for r in result.split('\n') if r.startswith('[')]) + '
' + return result + + +def close_up_code_segment_during_stream(gpt_reply): + """ + 在gpt输出代码的中途(输出了前面的```,但还没输出完后面的```),补上后面的``` + + Args: + gpt_reply (str): GPT模型返回的回复字符串。 + + Returns: + str: 返回一个新的字符串,将输出代码片段的“后面的```”补上。 + + """ + if '```' not in gpt_reply: + return gpt_reply + if gpt_reply.endswith('```'): + return gpt_reply + + # 排除了以上两个情况,我们 + segments = gpt_reply.split('```') + n_mark = len(segments) - 1 + if n_mark % 2 == 1: + # print('输出代码片段中!') + return gpt_reply+'\n```' + else: + return gpt_reply + +import markdown +from latex2mathml.converter import convert as tex2mathml +from functools import wraps, lru_cache +def markdown_convertion(txt): + """ + 将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。 + """ + pre = '
' + suf = '
' + if txt.startswith(pre) and txt.endswith(suf): + # print('警告,输入了已经经过转化的字符串,二次转化可能出问题') + return txt # 已经被转化过,不需要再次转化 + + markdown_extension_configs = { + 'mdx_math': { + 'enable_dollar_delimiter': True, + 'use_gitlab_delimiters': False, + }, + } + find_equation_pattern = r'\n', '') + return content + + + if ('$' in txt) and ('```' not in txt): # 有$标识的公式符号,且没有代码段```的标识 + # convert everything to html format + split = markdown.markdown(text='---') + convert_stage_1 = markdown.markdown(text=txt, extensions=['mdx_math', 'fenced_code', 'tables', 'sane_lists'], extension_configs=markdown_extension_configs) + convert_stage_1 = markdown_bug_hunt(convert_stage_1) + # re.DOTALL: Make the '.' special character match any character at all, including a newline; without this flag, '.' will match anything except a newline. Corresponds to the inline flag (?s). + # 1. convert to easy-to-copy tex (do not render math) + convert_stage_2_1, n = re.subn(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL) + # 2. convert to rendered equation + convert_stage_2_2, n = re.subn(find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL) + # cat them together + return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf + else: + return pre + markdown.markdown(txt, extensions=['fenced_code', 'codehilite', 'tables', 'sane_lists']) + suf + + +sample = preprocess_newbing_out(sample) +sample = close_up_code_segment_during_stream(sample) +sample = markdown_convertion(sample) +with open('tmp.html', 'w', encoding='utf8') as f: + f.write(""" + + + My Website + + + + """) + f.write(sample) diff --git a/request_llm/bridge_newbing.py b/request_llm/bridge_newbing.py index 66db8b6..2fa4761 100644 --- a/request_llm/bridge_newbing.py +++ b/request_llm/bridge_newbing.py @@ -27,12 +27,12 @@ def preprocess_newbing_out(s): sub = lambda m: '\['+m.group(1)+'\]' # 将匹配到的数字作为替换值 result = re.sub(pattern, sub, s) # 替换操作 if '[1]' in result: - result += '\n\n
\n\n' + "\n\n".join(['`'+r+'`' for r in result.split('\n') if r.startswith('[')]) + result += '\n\n```reference\n' + "\n".join([r for r in result.split('\n') if r.startswith('[')]) + '\n```\n' return result def preprocess_newbing_out_simple(result): if '[1]' in result: - result += '\n\n```\n' + "\n".join([r for r in result.split('\n') if r.startswith('[')]) + '\n```\n' + result += '\n\n```reference\n' + "\n".join([r for r in result.split('\n') if r.startswith('[')]) + '\n```\n' return result class NewBingHandle(Process): diff --git a/toolbox.py b/toolbox.py index af28e8d..c09ea74 100644 --- a/toolbox.py +++ b/toolbox.py @@ -271,8 +271,14 @@ def markdown_convertion(txt): content = content.replace('\n', '') return content + def no_code(txt): + if '```' not in txt: + return True + else: + if '```reference' in txt: return True # newbing + else: return False - if ('$' in txt) and ('```' not in txt): # 有$标识的公式符号,且没有代码段```的标识 + if ('$' in txt) and no_code(txt): # 有$标识的公式符号,且没有代码段```的标识 # convert everything to html format split = markdown.markdown(text='---') convert_stage_1 = markdown.markdown(text=txt, extensions=['mdx_math', 'fenced_code', 'tables', 'sane_lists'], extension_configs=markdown_extension_configs)