Merge branch 'new_langchain'

commit 2f148bada0
Author: binary-husky
Date:   2023-12-09 21:41:33 +08:00
11 changed files with 505 additions and 115 deletions

View File

@@ -159,7 +159,15 @@ def warm_up_modules():
     enc.encode("模块预热", disallowed_special=())
     enc = model_info["gpt-4"]['tokenizer']
     enc.encode("模块预热", disallowed_special=())
 
+def warm_up_vectordb():
+    print('正在执行一些模块的预热 ...')
+    from toolbox import ProxyNetworkActivate
+    with ProxyNetworkActivate("Warmup_Modules"):
+        import nltk
+        with ProxyNetworkActivate("Warmup_Modules"):
+            nltk.download("punkt")
+
 if __name__ == '__main__':
     import os
     os.environ['no_proxy'] = '*'  # avoid unexpected pollution from the proxy network
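The new `warm_up_vectordb` hook mirrors `warm_up_modules`: it is meant to run once at build or deploy time so the first knowledge-base request does not stall on the NLTK download. A minimal invocation sketch (the Dockerfile added later in this commit calls it exactly this way):

    # one-off warm-up, e.g. during a Docker image build;
    # fetches the NLTK "punkt" tokenizer into the local cache
    from check_proxy import warm_up_vectordb
    warm_up_vectordb()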

View File

@@ -440,7 +440,7 @@ def get_crazy_functions():
         print('Load function plugin failed')
 
     try:
-        from crazy_functions.Langchain知识库 import 知识库问答
+        from crazy_functions.知识库问答 import 知识库文件注入
         function_plugins.update({
             "构建知识库(先上传文件素材,再运行此插件)": {
                 "Group": "对话",
@@ -448,7 +448,7 @@ def get_crazy_functions():
                 "AsButton": False,
                 "AdvancedArgs": True,
                 "ArgsReminder": "此处待注入的知识库名称id, 默认为default。文件进入知识库后可长期保存。可以通过再次调用本插件的方式向知识库追加更多文档。",
-                "Function": HotReload(知识库问答)
+                "Function": HotReload(知识库文件注入)
             }
         })
     except:
@@ -456,9 +456,9 @@ def get_crazy_functions():
         print('Load function plugin failed')
 
     try:
-        from crazy_functions.Langchain知识库 import 读取知识库作答
+        from crazy_functions.知识库问答 import 读取知识库作答
         function_plugins.update({
-            "知识库问答(构建知识库后,再运行此插件)": {
+            "知识库文件注入(构建知识库后,再运行此插件)": {
                 "Group": "对话",
                 "Color": "stop",
                 "AsButton": False,

View File

@@ -1,4 +1,4 @@
-from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token
+from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton
 import threading
 import os
 import logging
@@ -631,89 +631,6 @@ def get_files_from_everything(txt, type): # type='.md'
-def Singleton(cls):
-    _instance = {}
-
-    def _singleton(*args, **kargs):
-        if cls not in _instance:
-            _instance[cls] = cls(*args, **kargs)
-        return _instance[cls]
-
-    return _singleton
-
-
-@Singleton
-class knowledge_archive_interface():
-    def __init__(self) -> None:
-        self.threadLock = threading.Lock()
-        self.current_id = ""
-        self.kai_path = None
-        self.qa_handle = None
-        self.text2vec_large_chinese = None
-
-    def get_chinese_text2vec(self):
-        if self.text2vec_large_chinese is None:
-            # < ------------------- warm up the text-embedding module --------------- >
-            from toolbox import ProxyNetworkActivate
-            print('Checking Text2vec ...')
-            from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-            with ProxyNetworkActivate('Download_LLM'):  # temporarily activate the proxy network
-                self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
-        return self.text2vec_large_chinese
-
-    def feed_archive(self, file_manifest, id="default"):
-        self.threadLock.acquire()
-        # import uuid
-        self.current_id = id
-        from zh_langchain import construct_vector_store
-        self.qa_handle, self.kai_path = construct_vector_store(
-            vs_id=self.current_id,
-            files=file_manifest,
-            sentence_size=100,
-            history=[],
-            one_conent="",
-            one_content_segmentation="",
-            text2vec=self.get_chinese_text2vec(),
-        )
-        self.threadLock.release()
-
-    def get_current_archive_id(self):
-        return self.current_id
-
-    def get_loaded_file(self):
-        return self.qa_handle.get_loaded_file()
-
-    def answer_with_archive_by_id(self, txt, id):
-        self.threadLock.acquire()
-        if not self.current_id == id:
-            self.current_id = id
-            from zh_langchain import construct_vector_store
-            self.qa_handle, self.kai_path = construct_vector_store(
-                vs_id=self.current_id,
-                files=[],
-                sentence_size=100,
-                history=[],
-                one_conent="",
-                one_content_segmentation="",
-                text2vec=self.get_chinese_text2vec(),
-            )
-        VECTOR_SEARCH_SCORE_THRESHOLD = 0
-        VECTOR_SEARCH_TOP_K = 4
-        CHUNK_SIZE = 512
-        resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
-            query=txt,
-            vs_path=self.kai_path,
-            score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
-            vector_search_top_k=VECTOR_SEARCH_TOP_K,
-            chunk_conent=True,
-            chunk_size=CHUNK_SIZE,
-            text2vec=self.get_chinese_text2vec(),
-        )
-        self.threadLock.release()
-        return resp, prompt
-
 @Singleton
 class nougat_interface():
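The local `Singleton` helper is deleted because `toolbox` now provides it (see the import change above), and the archive class itself moves to `crazy_functions/vector_fns/vector_database.py` later in this diff. The decorator replaces a class with a factory that caches a single instance, so every call site shares the same lock and state. A small behavioral sketch (assuming `toolbox.Singleton` behaves like the deleted copy above; the demo class is hypothetical):

    from toolbox import Singleton

    @Singleton
    class demo_interface():   # hypothetical class, for illustration only
        def __init__(self) -> None:
            self.calls = 0

    a = demo_interface()
    b = demo_interface()
    assert a is b   # repeated construction returns the cached instance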

View File

View File

@@ -0,0 +1,70 @@
# From project chatglm-langchain

from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import List

class ChineseTextSplitter(CharacterTextSplitter):
    def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
        super().__init__(**kwargs)
        self.pdf = pdf
        self.sentence_size = sentence_size

    def split_text1(self, text: str) -> List[str]:
        if self.pdf:
            text = re.sub(r"\n{3,}", "\n", text)
            text = re.sub('\s', ' ', text)
            text = text.replace("\n\n", "")
        sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|:(?=["‘“「『]{1,2}|$))')  # del :;
        sent_list = []
        for ele in sent_sep_pattern.split(text):
            if sent_sep_pattern.match(ele) and sent_list:
                sent_list[-1] += ele
            elif ele:
                sent_list.append(ele)
        return sent_list

    def split_text(self, text: str) -> List[str]:   ## the logic here still needs refinement
        if self.pdf:
            text = re.sub(r"\n{3,}", r"\n", text)
            text = re.sub('\s', " ", text)
            text = re.sub("\n\n", "", text)

        text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text)  # single-character sentence terminators
        text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text)  # English ellipsis
        text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text)  # Chinese ellipsis
        text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
        # A closing quote only ends a sentence if a terminator precedes it, so the split \n goes
        # after the quote; note that the rules above all carefully preserve closing quotes.
        text = text.rstrip()  # drop any trailing \n at the end of the paragraph
        # Many rule sets also split on semicolons; they are deliberately ignored here, as are
        # dashes and English double quotes; adjust the patterns if you need them.
        ls = [i for i in text.split("\n") if i]
        for ele in ls:
            if len(ele) > self.sentence_size:
                # still too long: split again on commas/periods, then on whitespace runs
                ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
                ele1_ls = ele1.split("\n")
                for ele_ele1 in ele1_ls:
                    if len(ele_ele1) > self.sentence_size:
                        ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
                        ele2_ls = ele_ele2.split("\n")
                        for ele_ele2 in ele2_ls:
                            if len(ele_ele2) > self.sentence_size:
                                ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
                                ele2_id = ele2_ls.index(ele_ele2)
                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ele2_id + 1:]
                        ele_id = ele1_ls.index(ele_ele1)
                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
                id = ls.index(ele)
                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
        return ls

def load_file(filepath, sentence_size):
    loader = UnstructuredFileLoader(filepath, mode="elements")
    textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
    docs = loader.load_and_split(text_splitter=textsplitter)
    # write_check_file(filepath, docs)
    return docs
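`load_file` parses any format Unstructured can handle, then re-splits the elements into sentence-sized chunks with the recursive rules above. A minimal usage sketch (the sample path and text are illustrative):

    from crazy_functions.vector_fns.general_file_loader import load_file, ChineseTextSplitter

    # split a raw string, capping chunks at 100 characters
    splitter = ChineseTextSplitter(pdf=False, sentence_size=100)
    chunks = splitter.split_text("第一句话。第二句话!超过长度上限的句子还会按逗号继续细分……")

    # or load + split a file into langchain Documents
    docs = load_file("./README.md", sentence_size=100)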

View File

@@ -0,0 +1,338 @@
# From project chatglm-langchain

import threading
from toolbox import Singleton
import os
import shutil
import uuid
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from typing import List, Tuple
import numpy as np
from crazy_functions.vector_fns.general_file_loader import load_file

embedding_model_dict = {
    "ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
    "ernie-base": "nghuyong/ernie-3.0-base-zh",
    "text2vec-base": "shibing624/text2vec-base-chinese",
    "text2vec": "GanymedeNil/text2vec-large-chinese",
}

# Embedding model name
EMBEDDING_MODEL = "text2vec"

# Embedding running device
EMBEDDING_DEVICE = "cpu"

# Context-based prompt template; be sure to keep "{question}" and "{context}"
PROMPT_TEMPLATE = """已知信息:
{context}

根据上述已知信息,简洁和专业的来回答用户的问题。如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。 问题是:{question}"""

# sentence size used when splitting text
SENTENCE_SIZE = 100

# context length of a single matched chunk
CHUNK_SIZE = 250

# LLM input history length
LLM_HISTORY_LEN = 3

# return top-k text chunks from the vector store
VECTOR_SEARCH_TOP_K = 5

# relevance score threshold for knowledge retrieval; the range is roughly 0-1100,
# 0 disables the filter; in testing, values below 500 give more precise matches
VECTOR_SEARCH_SCORE_THRESHOLD = 0

NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")

FLAG_USER_NAME = uuid.uuid4().hex

# whether to enable cross-origin requests; defaults to False, set True if needed
# is open cross domain
OPEN_CROSS_DOMAIN = False

def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4
) -> List[Tuple[Document, float]]:

    def seperate_list(ls: List[int]) -> List[List[int]]:
        # split a sorted index list into runs of consecutive ids
        lists = []
        ls1 = [ls[0]]
        for i in range(1, len(ls)):
            if ls[i - 1] + 1 == ls[i]:
                ls1.append(ls[i])
            else:
                lists.append(ls1)
                ls1 = [ls[i]]
        lists.append(ls1)
        return lists

    scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
    docs = []
    id_set = set()
    store_len = len(self.index_to_docstore_id)
    for j, i in enumerate(indices[0]):
        if i == -1 or 0 < self.score_threshold < scores[0][j]:
            # This happens when not enough docs are returned.
            continue
        _id = self.index_to_docstore_id[i]
        doc = self.docstore.search(_id)
        if not self.chunk_conent:
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {_id}, got {doc}")
            doc.metadata["score"] = int(scores[0][j])
            docs.append(doc)
            continue
        # chunk_conent: expand each hit with neighboring chunks from the same
        # source file until the combined length would exceed chunk_size
        id_set.add(i)
        docs_len = len(doc.page_content)
        for k in range(1, max(i, store_len - i)):
            break_flag = False
            for l in [i + k, i - k]:
                if 0 <= l < len(self.index_to_docstore_id):
                    _id0 = self.index_to_docstore_id[l]
                    doc0 = self.docstore.search(_id0)
                    if docs_len + len(doc0.page_content) > self.chunk_size:
                        break_flag = True
                        break
                    elif doc0.metadata["source"] == doc.metadata["source"]:
                        docs_len += len(doc0.page_content)
                        id_set.add(l)
            if break_flag:
                break
    if not self.chunk_conent:
        return docs
    if len(id_set) == 0 and self.score_threshold > 0:
        return []
    id_list = sorted(list(id_set))
    id_lists = seperate_list(id_list)
    for id_seq in id_lists:
        # merge each run of consecutive chunks back into a single Document
        for id in id_seq:
            if id == id_seq[0]:
                _id = self.index_to_docstore_id[id]
                doc = self.docstore.search(_id)
            else:
                _id0 = self.index_to_docstore_id[id]
                doc0 = self.docstore.search(_id0)
                doc.page_content += " " + doc0.page_content
        if not isinstance(doc, Document):
            raise ValueError(f"Could not find document for id {_id}, got {doc}")
        doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]])
        doc.metadata["score"] = int(doc_score)
        docs.append(doc)
    return docs

class LocalDocQA:
    llm: object = None
    embeddings: object = None
    top_k: int = VECTOR_SEARCH_TOP_K
    chunk_size: int = CHUNK_SIZE
    chunk_conent: bool = True
    score_threshold: int = VECTOR_SEARCH_SCORE_THRESHOLD

    def init_cfg(self,
                 top_k=VECTOR_SEARCH_TOP_K,
                 ):
        self.llm = None
        self.top_k = top_k

    def init_knowledge_vector_store(self,
                                    filepath,
                                    vs_path: str or os.PathLike = None,
                                    sentence_size=SENTENCE_SIZE,
                                    text2vec=None):
        loaded_files = []
        failed_files = []
        if isinstance(filepath, str):
            if not os.path.exists(filepath):
                print("路径不存在")
                return None
            elif os.path.isfile(filepath):
                file = os.path.split(filepath)[-1]
                try:
                    docs = load_file(filepath, SENTENCE_SIZE)
                    print(f"{file} 已成功加载")
                    loaded_files.append(filepath)
                except Exception as e:
                    print(e)
                    print(f"{file} 未能成功加载")
                    return None
            elif os.path.isdir(filepath):
                docs = []
                for file in tqdm(os.listdir(filepath), desc="加载文件"):
                    fullfilepath = os.path.join(filepath, file)
                    try:
                        docs += load_file(fullfilepath, SENTENCE_SIZE)
                        loaded_files.append(fullfilepath)
                    except Exception as e:
                        print(e)
                        failed_files.append(file)
                if len(failed_files) > 0:
                    print("以下文件未能成功加载:")
                    for file in failed_files:
                        print(f"{file}\n")
        else:
            docs = []
            for file in filepath:
                docs += load_file(file, SENTENCE_SIZE)
                print(f"{file} 已成功加载")
                loaded_files.append(file)

        if len(docs) > 0:
            print("文件加载完毕,正在生成向量库")
            if vs_path and os.path.isdir(vs_path):
                try:
                    # extend an existing FAISS store if one is already on disk
                    self.vector_store = FAISS.load_local(vs_path, text2vec)
                    self.vector_store.add_documents(docs)
                except:
                    self.vector_store = FAISS.from_documents(docs, text2vec)
            else:
                self.vector_store = FAISS.from_documents(docs, text2vec)  # docs is a list of Documents
            self.vector_store.save_local(vs_path)
            return vs_path, loaded_files
        else:
            raise RuntimeError("文件加载失败,请检查文件格式是否正确")

    def get_loaded_file(self, vs_path):
        ds = self.vector_store.docstore
        return set([ds._dict[k].metadata['source'].split(vs_path)[-1] for k in ds._dict])

    # query                 the query text
    # vs_path               path of the knowledge base
    # chunk_conent          whether to enable context expansion
    # score_threshold       score threshold for search matching
    # vector_search_top_k   how many knowledge-base entries to retrieve (default 5)
    # chunk_size            context length connected around a single matched chunk
    def get_knowledge_based_conent_test(self, query, vs_path, chunk_conent,
                                        score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
                                        vector_search_top_k=VECTOR_SEARCH_TOP_K, chunk_size=CHUNK_SIZE,
                                        text2vec=None):
        self.vector_store = FAISS.load_local(vs_path, text2vec)
        self.vector_store.chunk_conent = chunk_conent
        self.vector_store.score_threshold = score_threshold
        self.vector_store.chunk_size = chunk_size
        embedding = self.vector_store.embedding_function.embed_query(query)
        related_docs_with_score = similarity_search_with_score_by_vector(self.vector_store, embedding, k=vector_search_top_k)
        if not related_docs_with_score:
            response = {"query": query,
                        "source_documents": []}
            return response, ""
        # prompt = f"{query}. You should answer this question using information from following documents: \n\n"
        prompt = f"{query}. 你必须利用以下文档中包含的信息回答这个问题: \n\n---\n\n"
        prompt += "\n\n".join([f"({k}): " + doc.page_content for k, doc in enumerate(related_docs_with_score)])
        prompt += "\n\n---\n\n"
        prompt = prompt.encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
        # print(prompt)
        response = {"query": query, "source_documents": related_docs_with_score}
        return response, prompt

def construct_vector_store(vs_id, vs_path, files, sentence_size, history, one_conent, one_content_segmentation, text2vec):
    for file in files:
        assert os.path.exists(file), "输入文件不存在:" + file
    import nltk
    if NLTK_DATA_PATH not in nltk.data.path: nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
    local_doc_qa = LocalDocQA()
    local_doc_qa.init_cfg()
    filelist = []
    if not os.path.exists(os.path.join(vs_path, vs_id)):
        os.makedirs(os.path.join(vs_path, vs_id))
    for file in files:
        file_name = file.name if not isinstance(file, str) else file
        filename = os.path.split(file_name)[-1]
        shutil.copyfile(file_name, os.path.join(vs_path, vs_id, filename))
        filelist.append(os.path.join(vs_path, vs_id, filename))
    vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(filelist, os.path.join(vs_path, vs_id), sentence_size, text2vec)

    if len(loaded_files):
        file_status = f"已添加 {'、'.join([os.path.split(i)[-1] for i in loaded_files if i])} 内容至知识库,并已加载知识库,请开始提问"
    else:
        pass
        # file_status = "文件未成功加载,请重新上传文件"
    # print(file_status)
    return local_doc_qa, vs_path

@Singleton
class knowledge_archive_interface():
    def __init__(self) -> None:
        self.threadLock = threading.Lock()
        self.current_id = ""
        self.kai_path = None
        self.qa_handle = None
        self.text2vec_large_chinese = None

    def get_chinese_text2vec(self):
        if self.text2vec_large_chinese is None:
            # < ------------------- warm up the text-embedding module --------------- >
            from toolbox import ProxyNetworkActivate
            print('Checking Text2vec ...')
            from langchain.embeddings.huggingface import HuggingFaceEmbeddings
            with ProxyNetworkActivate('Download_LLM'):  # temporarily activate the proxy network
                self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
        return self.text2vec_large_chinese

    def feed_archive(self, file_manifest, vs_path, id="default"):
        self.threadLock.acquire()
        # import uuid
        self.current_id = id
        self.qa_handle, self.kai_path = construct_vector_store(
            vs_id=self.current_id,
            vs_path=vs_path,
            files=file_manifest,
            sentence_size=100,
            history=[],
            one_conent="",
            one_content_segmentation="",
            text2vec=self.get_chinese_text2vec(),
        )
        self.threadLock.release()

    def get_current_archive_id(self):
        return self.current_id

    def get_loaded_file(self, vs_path):
        return self.qa_handle.get_loaded_file(vs_path)

    def answer_with_archive_by_id(self, txt, id, vs_path):
        self.threadLock.acquire()
        if not self.current_id == id:
            self.current_id = id
            self.qa_handle, self.kai_path = construct_vector_store(
                vs_id=self.current_id,
                vs_path=vs_path,
                files=[],
                sentence_size=100,
                history=[],
                one_conent="",
                one_content_segmentation="",
                text2vec=self.get_chinese_text2vec(),
            )
        VECTOR_SEARCH_SCORE_THRESHOLD = 0
        VECTOR_SEARCH_TOP_K = 4
        CHUNK_SIZE = 512
        resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
            query=txt,
            vs_path=self.kai_path,
            score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
            vector_search_top_k=VECTOR_SEARCH_TOP_K,
            chunk_conent=True,
            chunk_size=CHUNK_SIZE,
            text2vec=self.get_chinese_text2vec(),
        )
        self.threadLock.release()
        return resp, prompt
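Taken together, the archive interface builds (or extends) a per-id FAISS store from uploaded files and later answers queries with a retrieval-augmented prompt. A minimal end-to-end sketch, assuming the text2vec-large-chinese weights can be downloaded (the storage path and the question are illustrative):

    from crazy_functions.vector_fns.vector_database import knowledge_archive_interface

    kai = knowledge_archive_interface()    # @Singleton: every call site shares this instance
    vs_path = "./gpt_log/demo/vec_store"   # illustrative storage folder
    kai.feed_archive(file_manifest=["./README.md"], vs_path=vs_path, id="default")
    resp, prompt = kai.answer_with_archive_by_id("What is the installation method", "default", vs_path)
    # `prompt` embeds the top-matching chunks and is what gets sent on to the LLM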

View File

@@ -1,10 +1,19 @@
-from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg
+from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg, get_log_folder, get_user
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything
 
+install_msg ="""
+1. python -m pip install torch --index-url https://download.pytorch.org/whl/cpu
+2. python -m pip install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
+3. python -m pip install unstructured[all-docs] --upgrade
+4. python -c 'import nltk; nltk.download("punkt")'
+"""
+
 @CatchException
-def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
+def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
     """
     txt             text typed by the user in the input field, e.g. a passage to translate or a path to files awaiting processing
     llm_kwargs      GPT model parameters such as temperature and top_p; usually passed through unchanged
@@ -25,15 +34,15 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
     # resolve deps
     try:
-        from zh_langchain import construct_vector_store
-        from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-        from .crazy_utils import knowledge_archive_interface
+        # from zh_langchain import construct_vector_store
+        # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+        from crazy_functions.vector_fns.vector_database import knowledge_archive_interface
     except Exception as e:
-        chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."])
+        chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
         yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
-        from .crazy_utils import try_install_deps
-        try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
-        yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
+        # from .crazy_utils import try_install_deps
+        # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
+        # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
         return
 
     # < -------------------- read the files --------------- >
@@ -42,7 +51,7 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
     for sp in spl:
         _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}')
         file_manifest += file_manifest_tmp
 
     if len(file_manifest) == 0:
         chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"])
         yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
@@ -62,13 +71,14 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
     print('Establishing knowledge archive ...')
     with ProxyNetworkActivate('Download_LLM'):  # temporarily activate the proxy network
         kai = knowledge_archive_interface()
-        kai.feed_archive(file_manifest=file_manifest, id=kai_id)
-        kai_files = kai.get_loaded_file()
+        vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store')
+        kai.feed_archive(file_manifest=file_manifest, vs_path=vs_path, id=kai_id)
+        kai_files = kai.get_loaded_file(vs_path=vs_path)
         kai_files = '<br/>'.join(kai_files)
     # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"])
     # yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
     # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id()
-    # chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答'
+    # chatbot._cookies['lock_plugin'] = 'crazy_functions.知识库文件注入->读取知识库作答'
     # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"])
     chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"])
     yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI; the GPT request takes a while, so refresh the page promptly first
@@ -77,15 +87,15 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
 def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
     # resolve deps
     try:
-        from zh_langchain import construct_vector_store
-        from langchain.embeddings.huggingface import HuggingFaceEmbeddings
-        from .crazy_utils import knowledge_archive_interface
+        # from zh_langchain import construct_vector_store
+        # from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+        from crazy_functions.vector_fns.vector_database import knowledge_archive_interface
     except Exception as e:
-        chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."])
+        chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg])
         yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
-        from .crazy_utils import try_install_deps
-        try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
-        yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
+        # from .crazy_utils import try_install_deps
+        # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
+        # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
         return
 
     # < ------------------- --------------- >
@@ -93,7 +103,8 @@ def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
     if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
     kai_id = plugin_kwargs.get("advanced_arg", 'default')
-    resp, prompt = kai.answer_with_archive_by_id(txt, kai_id)
+    vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store')
+    resp, prompt = kai.answer_with_archive_by_id(txt, kai_id, vs_path)
     chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt))
     yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI; the GPT request takes a while, so refresh the page promptly first

View File

@@ -0,0 +1,26 @@
# This Dockerfile builds a "no local model" environment; if you need local models such as chatglm, see docs/Dockerfile+ChatGLM
# How to build: edit `config.py` first, then run: docker build -t gpt-academic-nolocal-vs -f docs/GithubAction+NoLocal+Vectordb .
# How to run:   docker run --rm -it --net=host gpt-academic-nolocal-vs
FROM python:3.11

# set the working directory
WORKDIR /gpt

# copy the project files
COPY . .

# install dependencies
RUN pip3 install -r requirements.txt

# install the extra dependencies of the knowledge-base plugin
RUN apt-get update && apt-get install libgl1 -y
RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu
RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade
RUN pip3 install unstructured[all-docs] --upgrade

# optional steps to warm up the modules
RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()'
RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()'

# launch
CMD ["python3", "-u", "main.py"]

View File

@@ -48,11 +48,11 @@ if __name__ == "__main__":
     # for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]:
     #     plugin_test(plugin='crazy_functions.批量Markdown翻译->Markdown翻译指定语言', main_input="README.md", advanced_arg={"advanced_arg": lang})
 
-    # plugin_test(plugin='crazy_functions.Langchain知识库->知识库问答', main_input="./")
+    # plugin_test(plugin='crazy_functions.知识库文件注入->知识库文件注入', main_input="./")
 
-    # plugin_test(plugin='crazy_functions.Langchain知识库->读取知识库作答', main_input="What is the installation method")
+    # plugin_test(plugin='crazy_functions.知识库文件注入->读取知识库作答', main_input="What is the installation method")
 
-    # plugin_test(plugin='crazy_functions.Langchain知识库->读取知识库作答', main_input="远程云服务器部署?")
+    # plugin_test(plugin='crazy_functions.知识库文件注入->读取知识库作答', main_input="远程云服务器部署?")
 
     # plugin_test(plugin='crazy_functions.Latex输出PDF结果->Latex翻译中文并重新编译PDF', main_input="2210.03629")

View File

@@ -56,11 +56,11 @@ vt.get_plugin_handle = silence_stdout_fn(get_plugin_handle)
 vt.get_plugin_default_kwargs = silence_stdout_fn(get_plugin_default_kwargs)
 vt.get_chat_handle = silence_stdout_fn(get_chat_handle)
 vt.get_chat_default_kwargs = silence_stdout_fn(get_chat_default_kwargs)
-vt.chat_to_markdown_str = chat_to_markdown_str
+vt.chat_to_markdown_str = (chat_to_markdown_str)
 proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
     vt.get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
 
-def plugin_test(main_input, plugin, advanced_arg=None):
+def plugin_test(main_input, plugin, advanced_arg=None, debug=True):
     from rich.live import Live
     from rich.markdown import Markdown
@@ -72,7 +72,10 @@ def plugin_test(main_input, plugin, advanced_arg=None, debug=True):
     plugin_kwargs['main_input'] = main_input
     if advanced_arg is not None:
         plugin_kwargs['plugin_kwargs'] = advanced_arg
-    my_working_plugin = silence_stdout(plugin)(**plugin_kwargs)
+    if debug:
+        my_working_plugin = (plugin)(**plugin_kwargs)
+    else:
+        my_working_plugin = silence_stdout(plugin)(**plugin_kwargs)
 
     with Live(Markdown(""), auto_refresh=False, vertical_overflow="visible") as live:
         for cookies, chat, hist, msg in my_working_plugin:
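The new debug flag defaults to running the plugin generator un-silenced in the current process, which keeps stack traces and print output visible during development; pass debug=False to restore the old, quiet behavior. A usage sketch (the plugin path and input mirror the test file below):

    from tests.test_utils import plugin_test
    # silence the plugin's stdout, as the old behavior always did
    plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答',
                main_input="What is the installation method", debug=False)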

View File

@@ -0,0 +1,17 @@
"""
Tests for the project's plugins. How to run: python tests/test_plugins.py
"""

import os, sys

def validate_path(): dir_name = os.path.dirname(__file__); root_dir_assume = os.path.abspath(dir_name + '/..'); os.chdir(root_dir_assume); sys.path.append(root_dir_assume)
validate_path()  # switch the working directory to the project root

if __name__ == "__main__":
    from tests.test_utils import plugin_test
    plugin_test(plugin='crazy_functions.知识库问答->知识库文件注入', main_input="./README.md")
    plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="What is the installation method")
    plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="远程云服务器部署?")