gpt_academic text mask imp
This commit is contained in:
parent
f2e73aa580
commit
142b516749
@ -3,6 +3,7 @@
|
|||||||
# 'stop' 颜色对应 theme.py 中的 color_er
|
# 'stop' 颜色对应 theme.py 中的 color_er
|
||||||
import importlib
|
import importlib
|
||||||
from toolbox import clear_line_break
|
from toolbox import clear_line_break
|
||||||
|
from toolbox import build_gpt_academic_masked_string
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
|
|
||||||
def get_core_functions():
|
def get_core_functions():
|
||||||
@ -32,12 +33,12 @@ def get_core_functions():
|
|||||||
"Prefix": r"",
|
"Prefix": r"",
|
||||||
# 后缀,会被加在你的输入之后。例如,配合前缀可以把你的输入内容用引号圈起来
|
# 后缀,会被加在你的输入之后。例如,配合前缀可以把你的输入内容用引号圈起来
|
||||||
"Suffix":
|
"Suffix":
|
||||||
dedent("\n"+r'''
|
dedent("\n"+f'''
|
||||||
==============================
|
==============================
|
||||||
使用mermaid flowchart对以上文本进行总结,概括上述段落的内容以及内在逻辑关系,例如:
|
使用mermaid flowchart对以上文本进行总结,概括上述段落的内容以及内在逻辑关系,例如:
|
||||||
|
|
||||||
以下是对以上文本的总结,以mermaid flowchart的形式展示:
|
以下是对以上文本的总结,以mermaid flowchart的形式展示:
|
||||||
```mermaid
|
```{build_gpt_academic_masked_string(text_show_llm="mermaid", text_show_render="")}
|
||||||
flowchart LR
|
flowchart LR
|
||||||
A["节点名1"] --> B("节点名2")
|
A["节点名1"] --> B("节点名2")
|
||||||
B --> C{"节点名3"}
|
B --> C{"节点名3"}
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
import tiktoken, copy
|
import tiktoken, copy
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from toolbox import get_conf, trimmed_format_exc
|
from toolbox import get_conf, trimmed_format_exc, apply_gpt_academic_string_mask
|
||||||
|
|
||||||
from .bridge_chatgpt import predict_no_ui_long_connection as chatgpt_noui
|
from .bridge_chatgpt import predict_no_ui_long_connection as chatgpt_noui
|
||||||
from .bridge_chatgpt import predict as chatgpt_ui
|
from .bridge_chatgpt import predict as chatgpt_ui
|
||||||
@ -668,6 +668,7 @@ def predict_no_ui_long_connection(inputs, llm_kwargs, history, sys_prompt, obser
|
|||||||
"""
|
"""
|
||||||
import threading, time, copy
|
import threading, time, copy
|
||||||
|
|
||||||
|
inputs = apply_gpt_academic_string_mask(inputs, mode="show_llm")
|
||||||
model = llm_kwargs['llm_model']
|
model = llm_kwargs['llm_model']
|
||||||
n_model = 1
|
n_model = 1
|
||||||
if '&' not in model:
|
if '&' not in model:
|
||||||
@ -741,6 +742,7 @@ def predict(inputs, llm_kwargs, *args, **kwargs):
|
|||||||
additional_fn代表点击的哪个按钮,按钮见functional.py
|
additional_fn代表点击的哪个按钮,按钮见functional.py
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
inputs = apply_gpt_academic_string_mask(inputs, mode="show_llm")
|
||||||
method = model_info[llm_kwargs['llm_model']]["fn_with_ui"] # 如果这里报错,检查config中的AVAIL_LLM_MODELS选项
|
method = model_info[llm_kwargs['llm_model']]["fn_with_ui"] # 如果这里报错,检查config中的AVAIL_LLM_MODELS选项
|
||||||
yield from method(inputs, llm_kwargs, *args, **kwargs)
|
yield from method(inputs, llm_kwargs, *args, **kwargs)
|
||||||
|
|
||||||
|
@ -4,52 +4,47 @@ import os
|
|||||||
import math
|
import math
|
||||||
from textwrap import dedent
|
from textwrap import dedent
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pymdownx.superfences import fence_div_format, fence_code_format
|
from pymdownx.superfences import fence_code_format
|
||||||
from latex2mathml.converter import convert as tex2mathml
|
from latex2mathml.converter import convert as tex2mathml
|
||||||
from shared_utils.config_loader import get_conf as get_conf
|
from shared_utils.config_loader import get_conf as get_conf
|
||||||
|
from shared_utils.text_mask import apply_gpt_academic_string_mask
|
||||||
pj = os.path.join
|
|
||||||
default_user_name = 'default_user'
|
|
||||||
|
|
||||||
markdown_extension_configs = {
|
markdown_extension_configs = {
|
||||||
'mdx_math': {
|
"mdx_math": {
|
||||||
'enable_dollar_delimiter': True,
|
"enable_dollar_delimiter": True,
|
||||||
'use_gitlab_delimiters': False,
|
"use_gitlab_delimiters": False,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
code_highlight_configs = {
|
code_highlight_configs = {
|
||||||
"pymdownx.superfences": {
|
"pymdownx.superfences": {
|
||||||
'css_class': 'codehilite',
|
"css_class": "codehilite",
|
||||||
"custom_fences": [
|
"custom_fences": [
|
||||||
{
|
{"name": "mermaid", "class": "mermaid", "format": fence_code_format}
|
||||||
'name': 'mermaid',
|
],
|
||||||
'class': 'mermaid',
|
|
||||||
'format': fence_code_format
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"pymdownx.highlight": {
|
"pymdownx.highlight": {
|
||||||
'css_class': 'codehilite',
|
"css_class": "codehilite",
|
||||||
'guess_lang': True,
|
"guess_lang": True,
|
||||||
# 'auto_title': True,
|
# 'auto_title': True,
|
||||||
# 'linenums': True
|
# 'linenums': True
|
||||||
}
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def text_divide_paragraph(text):
|
def text_divide_paragraph(text):
|
||||||
"""
|
"""
|
||||||
将文本按照段落分隔符分割开,生成带有段落标签的HTML代码。
|
将文本按照段落分隔符分割开,生成带有段落标签的HTML代码。
|
||||||
"""
|
"""
|
||||||
pre = '<div class="markdown-body">'
|
pre = '<div class="markdown-body">'
|
||||||
suf = '</div>'
|
suf = "</div>"
|
||||||
if text.startswith(pre) and text.endswith(suf):
|
if text.startswith(pre) and text.endswith(suf):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
if '```' in text:
|
if "```" in text:
|
||||||
# careful input
|
# careful input
|
||||||
return text
|
return text
|
||||||
elif '</div>' in text:
|
elif "</div>" in text:
|
||||||
# careful input
|
# careful input
|
||||||
return text
|
return text
|
||||||
else:
|
else:
|
||||||
@ -71,20 +66,20 @@ def tex2mathml_catch_exception(content, *args, **kwargs):
|
|||||||
|
|
||||||
def replace_math_no_render(match):
|
def replace_math_no_render(match):
|
||||||
content = match.group(1)
|
content = match.group(1)
|
||||||
if 'mode=display' in match.group(0):
|
if "mode=display" in match.group(0):
|
||||||
content = content.replace('\n', '</br>')
|
content = content.replace("\n", "</br>")
|
||||||
return f"<font color=\"#00FF00\">$$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$$</font>"
|
return f'<font color="#00FF00">$$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$$</font>'
|
||||||
else:
|
else:
|
||||||
return f"<font color=\"#00FF00\">$</font><font color=\"#FF00FF\">{content}</font><font color=\"#00FF00\">$</font>"
|
return f'<font color="#00FF00">$</font><font color="#FF00FF">{content}</font><font color="#00FF00">$</font>'
|
||||||
|
|
||||||
|
|
||||||
def replace_math_render(match):
|
def replace_math_render(match):
|
||||||
content = match.group(1)
|
content = match.group(1)
|
||||||
if 'mode=display' in match.group(0):
|
if "mode=display" in match.group(0):
|
||||||
if '\\begin{aligned}' in content:
|
if "\\begin{aligned}" in content:
|
||||||
content = content.replace('\\begin{aligned}', '\\begin{array}')
|
content = content.replace("\\begin{aligned}", "\\begin{array}")
|
||||||
content = content.replace('\\end{aligned}', '\\end{array}')
|
content = content.replace("\\end{aligned}", "\\end{array}")
|
||||||
content = content.replace('&', ' ')
|
content = content.replace("&", " ")
|
||||||
content = tex2mathml_catch_exception(content, display="block")
|
content = tex2mathml_catch_exception(content, display="block")
|
||||||
return content
|
return content
|
||||||
else:
|
else:
|
||||||
@ -95,9 +90,11 @@ def markdown_bug_hunt(content):
|
|||||||
"""
|
"""
|
||||||
解决一个mdx_math的bug(单$包裹begin命令时多余<script>)
|
解决一个mdx_math的bug(单$包裹begin命令时多余<script>)
|
||||||
"""
|
"""
|
||||||
content = content.replace('<script type="math/tex">\n<script type="math/tex; mode=display">',
|
content = content.replace(
|
||||||
'<script type="math/tex; mode=display">')
|
'<script type="math/tex">\n<script type="math/tex; mode=display">',
|
||||||
content = content.replace('</script>\n</script>', '</script>')
|
'<script type="math/tex; mode=display">',
|
||||||
|
)
|
||||||
|
content = content.replace("</script>\n</script>", "</script>")
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
@ -105,25 +102,29 @@ def is_equation(txt):
|
|||||||
"""
|
"""
|
||||||
判定是否为公式 | 测试1 写出洛伦兹定律,使用tex格式公式 测试2 给出柯西不等式,使用latex格式 测试3 写出麦克斯韦方程组
|
判定是否为公式 | 测试1 写出洛伦兹定律,使用tex格式公式 测试2 给出柯西不等式,使用latex格式 测试3 写出麦克斯韦方程组
|
||||||
"""
|
"""
|
||||||
if '```' in txt and '```reference' not in txt: return False
|
if "```" in txt and "```reference" not in txt:
|
||||||
if '$' not in txt and '\\[' not in txt: return False
|
return False
|
||||||
|
if "$" not in txt and "\\[" not in txt:
|
||||||
|
return False
|
||||||
mathpatterns = {
|
mathpatterns = {
|
||||||
r'(?<!\\|\$)(\$)([^\$]+)(\$)': {'allow_multi_lines': False}, # $...$
|
r"(?<!\\|\$)(\$)([^\$]+)(\$)": {"allow_multi_lines": False}, # $...$
|
||||||
r'(?<!\\)(\$\$)([^\$]+)(\$\$)': {'allow_multi_lines': True}, # $$...$$
|
r"(?<!\\)(\$\$)([^\$]+)(\$\$)": {"allow_multi_lines": True}, # $$...$$
|
||||||
r'(?<!\\)(\\\[)(.+?)(\\\])': {'allow_multi_lines': False}, # \[...\]
|
r"(?<!\\)(\\\[)(.+?)(\\\])": {"allow_multi_lines": False}, # \[...\]
|
||||||
# r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False}, # \(...\)
|
# r'(?<!\\)(\\\()(.+?)(\\\))': {'allow_multi_lines': False}, # \(...\)
|
||||||
# r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True}, # \begin...\end
|
# r'(?<!\\)(\\begin{([a-z]+?\*?)})(.+?)(\\end{\2})': {'allow_multi_lines': True}, # \begin...\end
|
||||||
# r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False}, # $`...`$
|
# r'(?<!\\)(\$`)([^`]+)(`\$)': {'allow_multi_lines': False}, # $`...`$
|
||||||
}
|
}
|
||||||
matches = []
|
matches = []
|
||||||
for pattern, property in mathpatterns.items():
|
for pattern, property in mathpatterns.items():
|
||||||
flags = re.ASCII | re.DOTALL if property['allow_multi_lines'] else re.ASCII
|
flags = re.ASCII | re.DOTALL if property["allow_multi_lines"] else re.ASCII
|
||||||
matches.extend(re.findall(pattern, txt, flags))
|
matches.extend(re.findall(pattern, txt, flags))
|
||||||
if len(matches) == 0: return False
|
if len(matches) == 0:
|
||||||
|
return False
|
||||||
contain_any_eq = False
|
contain_any_eq = False
|
||||||
illegal_pattern = re.compile(r'[^\x00-\x7F]|echo')
|
illegal_pattern = re.compile(r"[^\x00-\x7F]|echo")
|
||||||
for match in matches:
|
for match in matches:
|
||||||
if len(match) != 3: return False
|
if len(match) != 3:
|
||||||
|
return False
|
||||||
eq_canidate = match[1]
|
eq_canidate = match[1]
|
||||||
if illegal_pattern.search(eq_canidate):
|
if illegal_pattern.search(eq_canidate):
|
||||||
return False
|
return False
|
||||||
@ -134,27 +135,28 @@ def is_equation(txt):
|
|||||||
|
|
||||||
def fix_markdown_indent(txt):
|
def fix_markdown_indent(txt):
|
||||||
# fix markdown indent
|
# fix markdown indent
|
||||||
if (' - ' not in txt) or ('. ' not in txt):
|
if (" - " not in txt) or (". " not in txt):
|
||||||
# do not need to fix, fast escape
|
# do not need to fix, fast escape
|
||||||
return txt
|
return txt
|
||||||
# walk through the lines and fix non-standard indentation
|
# walk through the lines and fix non-standard indentation
|
||||||
lines = txt.split("\n")
|
lines = txt.split("\n")
|
||||||
pattern = re.compile(r'^\s+-')
|
pattern = re.compile(r"^\s+-")
|
||||||
activated = False
|
activated = False
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
if line.startswith('- ') or line.startswith('1. '):
|
if line.startswith("- ") or line.startswith("1. "):
|
||||||
activated = True
|
activated = True
|
||||||
if activated and pattern.match(line):
|
if activated and pattern.match(line):
|
||||||
stripped_string = line.lstrip()
|
stripped_string = line.lstrip()
|
||||||
num_spaces = len(line) - len(stripped_string)
|
num_spaces = len(line) - len(stripped_string)
|
||||||
if (num_spaces % 4) == 3:
|
if (num_spaces % 4) == 3:
|
||||||
num_spaces_should_be = math.ceil(num_spaces / 4) * 4
|
num_spaces_should_be = math.ceil(num_spaces / 4) * 4
|
||||||
lines[i] = ' ' * num_spaces_should_be + stripped_string
|
lines[i] = " " * num_spaces_should_be + stripped_string
|
||||||
return '\n'.join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
FENCED_BLOCK_RE = re.compile(
|
FENCED_BLOCK_RE = re.compile(
|
||||||
dedent(r'''
|
dedent(
|
||||||
|
r"""
|
||||||
(?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]* # opening fence
|
(?P<fence>^[ \t]*(?:~{3,}|`{3,}))[ ]* # opening fence
|
||||||
((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
|
((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
|
||||||
(\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
|
(\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
|
||||||
@ -162,16 +164,17 @@ FENCED_BLOCK_RE = re.compile(
|
|||||||
\n # newline (end of opening fence)
|
\n # newline (end of opening fence)
|
||||||
(?P<code>.*?)(?<=\n) # the code block
|
(?P<code>.*?)(?<=\n) # the code block
|
||||||
(?P=fence)[ ]*$ # closing fence
|
(?P=fence)[ ]*$ # closing fence
|
||||||
'''),
|
"""
|
||||||
re.MULTILINE | re.DOTALL | re.VERBOSE
|
),
|
||||||
|
re.MULTILINE | re.DOTALL | re.VERBOSE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_line_range(re_match_obj, txt):
|
def get_line_range(re_match_obj, txt):
|
||||||
start_pos, end_pos = re_match_obj.regs[0]
|
start_pos, end_pos = re_match_obj.regs[0]
|
||||||
num_newlines_before = txt[:start_pos+1].count('\n')
|
num_newlines_before = txt[: start_pos + 1].count("\n")
|
||||||
line_start = num_newlines_before
|
line_start = num_newlines_before
|
||||||
line_end = num_newlines_before + txt[start_pos:end_pos].count('\n')+1
|
line_end = num_newlines_before + txt[start_pos:end_pos].count("\n") + 1
|
||||||
return line_start, line_end
|
return line_start, line_end
|
||||||
|
|
||||||
|
|
||||||
@ -181,14 +184,16 @@ def fix_code_segment_indent(txt):
|
|||||||
txt_tmp = txt
|
txt_tmp = txt
|
||||||
while True:
|
while True:
|
||||||
re_match_obj = FENCED_BLOCK_RE.search(txt_tmp)
|
re_match_obj = FENCED_BLOCK_RE.search(txt_tmp)
|
||||||
if not re_match_obj: break
|
if not re_match_obj:
|
||||||
if len(lines) == 0: lines = txt.split("\n")
|
break
|
||||||
|
if len(lines) == 0:
|
||||||
|
lines = txt.split("\n")
|
||||||
|
|
||||||
# 清空 txt_tmp 对应的位置方便下次搜索
|
# 清空 txt_tmp 对应的位置方便下次搜索
|
||||||
start_pos, end_pos = re_match_obj.regs[0]
|
start_pos, end_pos = re_match_obj.regs[0]
|
||||||
txt_tmp = txt_tmp[:start_pos] + ' '*(end_pos-start_pos) + txt_tmp[end_pos:]
|
txt_tmp = txt_tmp[:start_pos] + " " * (end_pos - start_pos) + txt_tmp[end_pos:]
|
||||||
line_start, line_end = get_line_range(re_match_obj, txt)
|
line_start, line_end = get_line_range(re_match_obj, txt)
|
||||||
|
|
||||||
# 获取公共缩进
|
# 获取公共缩进
|
||||||
shared_indent_cnt = 1e5
|
shared_indent_cnt = 1e5
|
||||||
for i in range(line_start, line_end):
|
for i in range(line_start, line_end):
|
||||||
@ -202,26 +207,26 @@ def fix_code_segment_indent(txt):
|
|||||||
num_spaces_should_be = math.ceil(shared_indent_cnt / 4) * 4
|
num_spaces_should_be = math.ceil(shared_indent_cnt / 4) * 4
|
||||||
for i in range(line_start, line_end):
|
for i in range(line_start, line_end):
|
||||||
add_n = num_spaces_should_be - shared_indent_cnt
|
add_n = num_spaces_should_be - shared_indent_cnt
|
||||||
lines[i] = ' ' * add_n + lines[i]
|
lines[i] = " " * add_n + lines[i]
|
||||||
if not change_any: # 遇到第一个
|
if not change_any: # 遇到第一个
|
||||||
change_any = True
|
change_any = True
|
||||||
|
|
||||||
if change_any:
|
if change_any:
|
||||||
return '\n'.join(lines)
|
return "\n".join(lines)
|
||||||
else:
|
else:
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
@lru_cache(maxsize=128) # 使用 lru缓存 加快转换速度
|
||||||
def markdown_convertion(txt):
|
def markdown_convertion(txt):
|
||||||
"""
|
"""
|
||||||
将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。
|
将Markdown格式的文本转换为HTML格式。如果包含数学公式,则先将公式转换为HTML格式。
|
||||||
"""
|
"""
|
||||||
pre = '<div class="markdown-body">'
|
pre = '<div class="markdown-body">'
|
||||||
suf = '</div>'
|
suf = "</div>"
|
||||||
if txt.startswith(pre) and txt.endswith(suf):
|
if txt.startswith(pre) and txt.endswith(suf):
|
||||||
# print('警告,输入了已经经过转化的字符串,二次转化可能出问题')
|
# print('警告,输入了已经经过转化的字符串,二次转化可能出问题')
|
||||||
return txt # 已经被转化过,不需要再次转化
|
return txt # 已经被转化过,不需要再次转化
|
||||||
|
|
||||||
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
find_equation_pattern = r'<script type="math/tex(?:.*?)>(.*?)</script>'
|
||||||
|
|
||||||
@ -229,18 +234,47 @@ def markdown_convertion(txt):
|
|||||||
# txt = fix_code_segment_indent(txt)
|
# txt = fix_code_segment_indent(txt)
|
||||||
if is_equation(txt): # 有$标识的公式符号,且没有代码段```的标识
|
if is_equation(txt): # 有$标识的公式符号,且没有代码段```的标识
|
||||||
# convert everything to html format
|
# convert everything to html format
|
||||||
split = markdown.markdown(text='---')
|
split = markdown.markdown(text="---")
|
||||||
convert_stage_1 = markdown.markdown(text=txt, extensions=['sane_lists', 'tables', 'mdx_math', 'pymdownx.superfences', 'pymdownx.highlight'],
|
convert_stage_1 = markdown.markdown(
|
||||||
extension_configs={**markdown_extension_configs, **code_highlight_configs})
|
text=txt,
|
||||||
|
extensions=[
|
||||||
|
"sane_lists",
|
||||||
|
"tables",
|
||||||
|
"mdx_math",
|
||||||
|
"pymdownx.superfences",
|
||||||
|
"pymdownx.highlight",
|
||||||
|
],
|
||||||
|
extension_configs={**markdown_extension_configs, **code_highlight_configs},
|
||||||
|
)
|
||||||
convert_stage_1 = markdown_bug_hunt(convert_stage_1)
|
convert_stage_1 = markdown_bug_hunt(convert_stage_1)
|
||||||
# 1. convert to easy-to-copy tex (do not render math)
|
# 1. convert to easy-to-copy tex (do not render math)
|
||||||
convert_stage_2_1, n = re.subn(find_equation_pattern, replace_math_no_render, convert_stage_1, flags=re.DOTALL)
|
convert_stage_2_1, n = re.subn(
|
||||||
|
find_equation_pattern,
|
||||||
|
replace_math_no_render,
|
||||||
|
convert_stage_1,
|
||||||
|
flags=re.DOTALL,
|
||||||
|
)
|
||||||
# 2. convert to rendered equation
|
# 2. convert to rendered equation
|
||||||
convert_stage_2_2, n = re.subn(find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL)
|
convert_stage_2_2, n = re.subn(
|
||||||
|
find_equation_pattern, replace_math_render, convert_stage_1, flags=re.DOTALL
|
||||||
|
)
|
||||||
# cat them together
|
# cat them together
|
||||||
return pre + convert_stage_2_1 + f'{split}' + convert_stage_2_2 + suf
|
return pre + convert_stage_2_1 + f"{split}" + convert_stage_2_2 + suf
|
||||||
else:
|
else:
|
||||||
return pre + markdown.markdown(txt, extensions=['sane_lists', 'tables', 'pymdownx.superfences', 'pymdownx.highlight'], extension_configs=code_highlight_configs) + suf
|
return (
|
||||||
|
pre
|
||||||
|
+ markdown.markdown(
|
||||||
|
txt,
|
||||||
|
extensions=[
|
||||||
|
"sane_lists",
|
||||||
|
"tables",
|
||||||
|
"pymdownx.superfences",
|
||||||
|
"pymdownx.highlight",
|
||||||
|
],
|
||||||
|
extension_configs=code_highlight_configs,
|
||||||
|
)
|
||||||
|
+ suf
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def close_up_code_segment_during_stream(gpt_reply):
|
def close_up_code_segment_during_stream(gpt_reply):
|
||||||
@ -254,16 +288,16 @@ def close_up_code_segment_during_stream(gpt_reply):
|
|||||||
str: 返回一个新的字符串,将输出代码片段的“后面的```”补上。
|
str: 返回一个新的字符串,将输出代码片段的“后面的```”补上。
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if '```' not in gpt_reply:
|
if "```" not in gpt_reply:
|
||||||
return gpt_reply
|
return gpt_reply
|
||||||
if gpt_reply.endswith('```'):
|
if gpt_reply.endswith("```"):
|
||||||
return gpt_reply
|
return gpt_reply
|
||||||
|
|
||||||
# 排除了以上两个情况,我们
|
# 排除了以上两个情况,我们
|
||||||
segments = gpt_reply.split('```')
|
segments = gpt_reply.split("```")
|
||||||
n_mark = len(segments) - 1
|
n_mark = len(segments) - 1
|
||||||
if n_mark % 2 == 1:
|
if n_mark % 2 == 1:
|
||||||
return gpt_reply + '\n```' # 输出代码片段中!
|
return gpt_reply + "\n```" # 输出代码片段中!
|
||||||
else:
|
else:
|
||||||
return gpt_reply
|
return gpt_reply
|
||||||
|
|
||||||
@ -275,13 +309,23 @@ def format_io(self, y):
|
|||||||
if y is None or y == []:
|
if y is None or y == []:
|
||||||
return []
|
return []
|
||||||
i_ask, gpt_reply = y[-1]
|
i_ask, gpt_reply = y[-1]
|
||||||
|
i_ask = apply_gpt_academic_string_mask(i_ask, mode="show_render")
|
||||||
|
gpt_reply = apply_gpt_academic_string_mask(gpt_reply, mode="show_render")
|
||||||
# 输入部分太自由,预处理一波
|
# 输入部分太自由,预处理一波
|
||||||
if i_ask is not None: i_ask = text_divide_paragraph(i_ask)
|
if i_ask is not None:
|
||||||
|
i_ask = text_divide_paragraph(i_ask)
|
||||||
# 当代码输出半截的时候,试着补上后个```
|
# 当代码输出半截的时候,试着补上后个```
|
||||||
if gpt_reply is not None: gpt_reply = close_up_code_segment_during_stream(gpt_reply)
|
if gpt_reply is not None:
|
||||||
# process
|
gpt_reply = close_up_code_segment_during_stream(gpt_reply)
|
||||||
|
# 处理提问与输出
|
||||||
y[-1] = (
|
y[-1] = (
|
||||||
None if i_ask is None else markdown.markdown(i_ask, extensions=['pymdownx.superfences', 'tables', 'pymdownx.highlight'], extension_configs=code_highlight_configs),
|
None
|
||||||
None if gpt_reply is None else markdown_convertion(gpt_reply)
|
if i_ask is None
|
||||||
|
else markdown.markdown(
|
||||||
|
i_ask,
|
||||||
|
extensions=["pymdownx.superfences", "tables", "pymdownx.highlight"],
|
||||||
|
extension_configs=code_highlight_configs,
|
||||||
|
),
|
||||||
|
None if gpt_reply is None else markdown_convertion(gpt_reply),
|
||||||
)
|
)
|
||||||
return y
|
return y
|
||||||
|
56
shared_utils/text_mask.py
Normal file
56
shared_utils/text_mask.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
import re
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
# 这段代码是使用Python编程语言中的re模块,即正则表达式库,来定义了一个正则表达式模式。
|
||||||
|
# The pattern is compiled into a regular-expression object stored in the variable const_extract_re, so later matching and substitution are fast.
|
||||||
|
# 这里解释一下正则表达式中的几个特殊字符:
|
||||||
|
# - . 表示任意单一字符。
|
||||||
|
# - * 表示前一个字符可以出现0次或多次。
|
||||||
|
# - ? 在这里用作非贪婪匹配,也就是说它会匹配尽可能少的字符。在(.*?)中,它确保我们匹配的任意文本是尽可能短的,也就是说,它会在</show_llm>和</show_render>标签之前停止匹配。
|
||||||
|
# - () 括号在正则表达式中表示捕获组。
|
||||||
|
# - 在这个例子中,(.*?)表示捕获任意长度的文本,直到遇到括号外部最近的限定符,即</show_llm>和</show_render>。
|
||||||
|
|
||||||
|
# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=/1=-=-=-=-=-=-=-=-=-=-=-=-=-=/2-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||||
|
const_extract_re = re.compile(
    r"<gpt_academic_string_mask><show_llm>(.*?)</show_llm><show_render>(.*?)</show_render></gpt_academic_string_mask>",
    # DOTALL lets both payloads span several lines; without it, a masked text
    # containing a newline is never matched and leaks through with its raw tags.
    flags=re.DOTALL,
)


@lru_cache(maxsize=128)
def apply_gpt_academic_string_mask(string, mode="show_all"):
    """
    Resolve every mask tag in ``string`` for a single audience.

    A mask tag has the form
    ``<gpt_academic_string_mask><show_llm>A</show_llm><show_render>B</show_render></gpt_academic_string_mask>``
    and carries two alternative payloads: ``A`` is meant for the large language
    model, ``B`` for the web renderer. Each tag is replaced by the payload
    selected through ``mode``; text outside the tags is left untouched.

    Diagram: https://mermaid.live/edit#pako:eNqlkUtLw0AUhf9KuOta0iaTplkIPlpduFJwoZEwJGNbzItpita2O6tF8QGKogXFtwu7cSHiq3-mk_oznFR8IYLgrGbuOd9hDrcCpmcR0GDW9ubNPKaBMDauuwI_A9M6YN-3y0bODwxsYos4BdMoBrTg5gwHF-d0mBH6-vqFQe58ed5m9XPW2uteX3Tubrj0ljLYcwxxR3h1zB43WeMs3G19yEM9uapDMe_NG9i2dagKw1Fee4c1D9nGEbtc-5n6HbNtJ8IyHOs8tbs7V2HrlDX2w2Y7XD_5haHEtQiNsOwfMVa_7TzsvrWIuJGo02qTrdwLk9gukQylHv3Afv1ML270s-HZUndrmW1tdA-WfvbM_jMFYuAQ6uCCxVdciTJ1CPLEITpo_GphypeouzXuw6XAmyi7JmgBLZEYlHwLB2S4gHMUO-9DH7tTnvf1CVoFFkBLSOk4QmlRTqpIlaWUHINyNFXjaQWpCYRURUKiWovBYo8X4ymEJFlECQUpqaQkJmuvWygPpg

    Args:
        string: Text that may contain mask tags. ``None`` and ``""`` are
            returned unchanged.
        mode: ``"show_all"`` returns ``string`` as-is (tags included);
            ``"show_llm"`` keeps the ``<show_llm>`` payload of every tag;
            ``"show_render"`` keeps the ``<show_render>`` payload of every tag.

    Returns:
        The processed string, or the original falsy value when there is
        nothing to process.

    Raises:
        ValueError: If ``mode`` is not one of the three values above.
    """
    if mode == "show_all":
        return string
    if mode == "show_llm":
        replacement = r"\1"
    elif mode == "show_render":
        replacement = r"\2"
    else:
        raise ValueError("Invalid mode")
    # format_io applies the mask before its own ``is not None`` checks, so a
    # missing question or reply arrives here as None; re.sub would raise
    # TypeError on it. Hand falsy input back untouched instead.
    if not string:
        return string
    return const_extract_re.sub(replacement, string)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=128)
def build_gpt_academic_masked_string(text_show_llm="", text_show_render=""):
    """
    Build a mask tag that carries two alternative texts.

    Args:
        text_show_llm: Payload placed inside ``<show_llm>...</show_llm>``.
        text_show_render: Payload placed inside ``<show_render>...</show_render>``.

    Returns:
        Both payloads wrapped in a single
        ``<gpt_academic_string_mask>...</gpt_academic_string_mask>`` element.
    """
    llm_part = f"<show_llm>{text_show_llm}</show_llm>"
    render_part = f"<show_render>{text_show_render}</show_render>"
    return "<gpt_academic_string_mask>" + llm_part + render_part + "</gpt_academic_string_mask>"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Test
|
||||||
|
input_string = (
|
||||||
|
"你好\n"
|
||||||
|
+ build_gpt_academic_masked_string(text_show_llm="mermaid", text_show_render="")
|
||||||
|
+ "你好\n"
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
apply_gpt_academic_string_mask(input_string, "show_llm")
|
||||||
|
) # Should print the text with "mermaid" in place of the mask tag
|
||||||
|
print(
|
||||||
|
apply_gpt_academic_string_mask(input_string, "show_render")
|
||||||
|
) # Should print the text with the mask tag removed (the render payload is empty)
|
@ -19,6 +19,8 @@ from shared_utils.connect_void_terminal import get_chat_handle
|
|||||||
from shared_utils.connect_void_terminal import get_plugin_handle
|
from shared_utils.connect_void_terminal import get_plugin_handle
|
||||||
from shared_utils.connect_void_terminal import get_plugin_default_kwargs
|
from shared_utils.connect_void_terminal import get_plugin_default_kwargs
|
||||||
from shared_utils.connect_void_terminal import get_chat_default_kwargs
|
from shared_utils.connect_void_terminal import get_chat_default_kwargs
|
||||||
|
from shared_utils.text_mask import apply_gpt_academic_string_mask
|
||||||
|
from shared_utils.text_mask import build_gpt_academic_masked_string
|
||||||
|
|
||||||
pj = os.path.join
|
pj = os.path.join
|
||||||
default_user_name = "default_user"
|
default_user_name = "default_user"
|
||||||
@ -67,7 +69,9 @@ class ChatBotWithCookies(list):
|
|||||||
|
|
||||||
def ArgsGeneralWrapper(f):
|
def ArgsGeneralWrapper(f):
|
||||||
"""
|
"""
|
||||||
装饰器函数,用于重组输入参数,改变输入参数的顺序与结构。
|
装饰器函数ArgsGeneralWrapper,用于重组输入参数,改变输入参数的顺序与结构。
|
||||||
|
该装饰器是大多数功能调用的入口。
|
||||||
|
函数示意图:https://mermaid.live/edit#pako:eNqNVFtPGkEY_StkntoEDQtLoTw0sWqapjQxVWPabmOm7AiEZZcsQ9QiiW012qixqdeqqIn10geBh6ZR8PJnmAWe-hc6l3VhrWnLEzNzzvnO953ZyYOYoSIQAWOaMR5LQBN7hvoU3UN_g5iu7imAXEyT4wUF3Pd0dT3y9KGYYUJsmK8V0GPGs0-QjkyojZgwk0Fm82C2dVghX08U8EaoOHjOfoEMU0XmADRhOksVWnNLjdpM82qFzB6S5Q_WWsUhuqCc3JtAsVR_OoMnhyZwXgHWwbS1d4gnsLVZJp-P6mfVxveqAgqC70Jz_pQCOGDKM5xFdNNPDdilF6uSU_hOYqu4a3MHYDZLDzq5fodrC3PWcEaFGPUaRiqJWK_W9g9rvRITa4dhy_0nw67SiePMp3oSR6PPn41DGgllkvkizYwsrmtaejTFd8V4yekGmT1zqrt4XGlAy8WTuiPULF01LksZvukSajfQQRAxmYi5S0D81sDcyzapVdn6sYFHkjhhGyel3frVQnvsnbR23lEjlhIlaOJiFPWzU5G4tfNJo8ejwp47-TbvJkKKZvmxA6SKo16oaazJysfG6klr9T0pbTW2ZqzlL_XaT8fYbQLXe4mSmvoCZXMaa7FePW6s7jVqK9bujvse3WFjY5_Z4KfsA4oiPY4T7Drvn1tLJTbG1to1qR79ulgk89-oJbvZzbIwJty6u20LOReWa9BvwserUd9s9MIKc3x5TUWEoAhUyJK5y85w_yG-dFu_R9waoU7K581y8W_qLle35-rG9Nxcrz8QHRsc0K-r9NViYRT36KsFvCCNzDRMqvSVyzOKAnACpZECIvSvCs2UAhS9QHEwh43BST0GItjMIS_I8e-sLwnj9A262cxA_ZVh0OUY1LJiDSJ5MAEiUijYLUtBORR6KElyQPaCSRDpksNSd8AfluSgHPaFC17wjrOlbgbzyyFf4IFPDvoD_sJvnkdK-g
|
||||||
"""
|
"""
|
||||||
def decorated(request: gradio.Request, cookies, max_length, llm_model, txt, txt2, top_p, temperature, chatbot, history, system_prompt, plugin_advanced_arg, *args):
|
def decorated(request: gradio.Request, cookies, max_length, llm_model, txt, txt2, top_p, temperature, chatbot, history, system_prompt, plugin_advanced_arg, *args):
|
||||||
txt_passon = txt
|
txt_passon = txt
|
||||||
|
Loading…
x
Reference in New Issue
Block a user