diff --git a/check_proxy.py b/check_proxy.py index e2ba3f1..2df8185 100644 --- a/check_proxy.py +++ b/check_proxy.py @@ -159,7 +159,15 @@ def warm_up_modules(): enc.encode("模块预热", disallowed_special=()) enc = model_info["gpt-4"]['tokenizer'] enc.encode("模块预热", disallowed_special=()) + +def warm_up_vectordb(): + print('正在执行一些模块的预热 ...') + from toolbox import ProxyNetworkActivate + with ProxyNetworkActivate("Warmup_Modules"): + import nltk + with ProxyNetworkActivate("Warmup_Modules"): nltk.download("punkt") + if __name__ == '__main__': import os os.environ['no_proxy'] = '*' # 避免代理网络产生意外污染 diff --git a/crazy_functional.py b/crazy_functional.py index 31766f0..c8f626d 100644 --- a/crazy_functional.py +++ b/crazy_functional.py @@ -440,7 +440,7 @@ def get_crazy_functions(): print('Load function plugin failed') try: - from crazy_functions.Langchain知识库 import 知识库问答 + from crazy_functions.知识库问答 import 知识库文件注入 function_plugins.update({ "构建知识库(先上传文件素材,再运行此插件)": { "Group": "对话", @@ -448,7 +448,7 @@ def get_crazy_functions(): "AsButton": False, "AdvancedArgs": True, "ArgsReminder": "此处待注入的知识库名称id, 默认为default。文件进入知识库后可长期保存。可以通过再次调用本插件的方式,向知识库追加更多文档。", - "Function": HotReload(知识库问答) + "Function": HotReload(知识库文件注入) } }) except: @@ -456,9 +456,9 @@ def get_crazy_functions(): print('Load function plugin failed') try: - from crazy_functions.Langchain知识库 import 读取知识库作答 + from crazy_functions.知识库问答 import 读取知识库作答 function_plugins.update({ - "知识库问答(构建知识库后,再运行此插件)": { + "知识库文件注入(构建知识库后,再运行此插件)": { "Group": "对话", "Color": "stop", "AsButton": False, diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py index afe079f..9778053 100644 --- a/crazy_functions/crazy_utils.py +++ b/crazy_functions/crazy_utils.py @@ -1,4 +1,4 @@ -from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token +from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton import threading import os import logging @@ -631,89 +631,6 @@ def get_files_from_everything(txt, type): # type='.md' - -def Singleton(cls): - _instance = {} - - def _singleton(*args, **kargs): - if cls not in _instance: - _instance[cls] = cls(*args, **kargs) - return _instance[cls] - - return _singleton - - -@Singleton -class knowledge_archive_interface(): - def __init__(self) -> None: - self.threadLock = threading.Lock() - self.current_id = "" - self.kai_path = None - self.qa_handle = None - self.text2vec_large_chinese = None - - def get_chinese_text2vec(self): - if self.text2vec_large_chinese is None: - # < -------------------预热文本向量化模组--------------- > - from toolbox import ProxyNetworkActivate - print('Checking Text2vec ...') - from langchain.embeddings.huggingface import HuggingFaceEmbeddings - with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 - self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese") - - return self.text2vec_large_chinese - - - def feed_archive(self, file_manifest, id="default"): - self.threadLock.acquire() - # import uuid - self.current_id = id - from zh_langchain import construct_vector_store - self.qa_handle, self.kai_path = construct_vector_store( - vs_id=self.current_id, - files=file_manifest, - sentence_size=100, - history=[], - one_conent="", - one_content_segmentation="", - text2vec = self.get_chinese_text2vec(), - ) - self.threadLock.release() - - def get_current_archive_id(self): - return self.current_id - - def get_loaded_file(self): - return self.qa_handle.get_loaded_file() - - def answer_with_archive_by_id(self, txt, id): - self.threadLock.acquire() - if not self.current_id == id: - self.current_id = id - from zh_langchain import construct_vector_store - self.qa_handle, self.kai_path = construct_vector_store( - vs_id=self.current_id, - files=[], - sentence_size=100, - history=[], - one_conent="", - one_content_segmentation="", - text2vec = self.get_chinese_text2vec(), - ) - VECTOR_SEARCH_SCORE_THRESHOLD = 0 - VECTOR_SEARCH_TOP_K = 4 - CHUNK_SIZE = 512 - resp, prompt = self.qa_handle.get_knowledge_based_conent_test( - query = txt, - vs_path = self.kai_path, - score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD, - vector_search_top_k=VECTOR_SEARCH_TOP_K, - chunk_conent=True, - chunk_size=CHUNK_SIZE, - text2vec = self.get_chinese_text2vec(), - ) - self.threadLock.release() - return resp, prompt @Singleton class nougat_interface(): diff --git a/crazy_functions/vector_fns/__init__.py b/crazy_functions/vector_fns/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crazy_functions/vector_fns/general_file_loader.py b/crazy_functions/vector_fns/general_file_loader.py new file mode 100644 index 0000000..a512c48 --- /dev/null +++ b/crazy_functions/vector_fns/general_file_loader.py @@ -0,0 +1,70 @@ +# From project chatglm-langchain + + +from langchain.document_loaders import UnstructuredFileLoader +from langchain.text_splitter import CharacterTextSplitter +import re +from typing import List + +class ChineseTextSplitter(CharacterTextSplitter): + def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs): + super().__init__(**kwargs) + self.pdf = pdf + self.sentence_size = sentence_size + + def split_text1(self, text: str) -> List[str]: + if self.pdf: + text = re.sub(r"\n{3,}", "\n", text) + text = re.sub('\s', ' ', text) + text = text.replace("\n\n", "") + sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :; + sent_list = [] + for ele in sent_sep_pattern.split(text): + if sent_sep_pattern.match(ele) and sent_list: + sent_list[-1] += ele + elif ele: + sent_list.append(ele) + return sent_list + + def split_text(self, text: str) -> List[str]: ##此处需要进一步优化逻辑 + if self.pdf: + text = re.sub(r"\n{3,}", r"\n", text) + text = re.sub('\s', " ", text) + text = re.sub("\n\n", "", text) + + text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符 + text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号 + text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号 + text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text) + # 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号 + text = text.rstrip() # 段尾如果有多余的\n就去掉它 + # 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。 + ls = [i for i in text.split("\n") if i] + for ele in ls: + if len(ele) > self.sentence_size: + ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele) + ele1_ls = ele1.split("\n") + for ele_ele1 in ele1_ls: + if len(ele_ele1) > self.sentence_size: + ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1) + ele2_ls = ele_ele2.split("\n") + for ele_ele2 in ele2_ls: + if len(ele_ele2) > self.sentence_size: + ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2) + ele2_id = ele2_ls.index(ele_ele2) + ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[ + ele2_id + 1:] + ele_id = ele1_ls.index(ele_ele1) + ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:] + + id = ls.index(ele) + ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:] + return ls + +def load_file(filepath, sentence_size): + loader = UnstructuredFileLoader(filepath, mode="elements") + textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size) + docs = loader.load_and_split(text_splitter=textsplitter) + # write_check_file(filepath, docs) + return docs + diff --git a/crazy_functions/vector_fns/vector_database.py b/crazy_functions/vector_fns/vector_database.py new file mode 100644 index 0000000..cffa22c --- /dev/null +++ b/crazy_functions/vector_fns/vector_database.py @@ -0,0 +1,338 @@ +# From project chatglm-langchain + +import threading +from toolbox import Singleton +import os +import shutil +import os +import uuid +import tqdm +from langchain.vectorstores import FAISS +from langchain.docstore.document import Document +from typing import List, Tuple +import numpy as np +from crazy_functions.vector_fns.general_file_loader import load_file + +embedding_model_dict = { + "ernie-tiny": "nghuyong/ernie-3.0-nano-zh", + "ernie-base": "nghuyong/ernie-3.0-base-zh", + "text2vec-base": "shibing624/text2vec-base-chinese", + "text2vec": "GanymedeNil/text2vec-large-chinese", +} + +# Embedding model name +EMBEDDING_MODEL = "text2vec" + +# Embedding running device +EMBEDDING_DEVICE = "cpu" + +# 基于上下文的prompt模版,请务必保留"{question}"和"{context}" +PROMPT_TEMPLATE = """已知信息: +{context} + +根据上述已知信息,简洁和专业的来回答用户的问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”,不允许在答案中添加编造成分,答案请使用中文。 问题是:{question}""" + +# 文本分句长度 +SENTENCE_SIZE = 100 + +# 匹配后单段上下文长度 +CHUNK_SIZE = 250 + +# LLM input history length +LLM_HISTORY_LEN = 3 + +# return top-k text chunk from vector store +VECTOR_SEARCH_TOP_K = 5 + +# 知识检索内容相关度 Score, 数值范围约为0-1100,如果为0,则不生效,经测试设置为小于500时,匹配结果更精准 +VECTOR_SEARCH_SCORE_THRESHOLD = 0 + +NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data") + +FLAG_USER_NAME = uuid.uuid4().hex + +# 是否开启跨域,默认为False,如果需要开启,请设置为True +# is open cross domain +OPEN_CROSS_DOMAIN = False + +def similarity_search_with_score_by_vector( + self, embedding: List[float], k: int = 4 +) -> List[Tuple[Document, float]]: + + def seperate_list(ls: List[int]) -> List[List[int]]: + lists = [] + ls1 = [ls[0]] + for i in range(1, len(ls)): + if ls[i - 1] + 1 == ls[i]: + ls1.append(ls[i]) + else: + lists.append(ls1) + ls1 = [ls[i]] + lists.append(ls1) + return lists + + scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k) + docs = [] + id_set = set() + store_len = len(self.index_to_docstore_id) + for j, i in enumerate(indices[0]): + if i == -1 or 0 < self.score_threshold < scores[0][j]: + # This happens when not enough docs are returned. + continue + _id = self.index_to_docstore_id[i] + doc = self.docstore.search(_id) + if not self.chunk_conent: + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + doc.metadata["score"] = int(scores[0][j]) + docs.append(doc) + continue + id_set.add(i) + docs_len = len(doc.page_content) + for k in range(1, max(i, store_len - i)): + break_flag = False + for l in [i + k, i - k]: + if 0 <= l < len(self.index_to_docstore_id): + _id0 = self.index_to_docstore_id[l] + doc0 = self.docstore.search(_id0) + if docs_len + len(doc0.page_content) > self.chunk_size: + break_flag = True + break + elif doc0.metadata["source"] == doc.metadata["source"]: + docs_len += len(doc0.page_content) + id_set.add(l) + if break_flag: + break + if not self.chunk_conent: + return docs + if len(id_set) == 0 and self.score_threshold > 0: + return [] + id_list = sorted(list(id_set)) + id_lists = seperate_list(id_list) + for id_seq in id_lists: + for id in id_seq: + if id == id_seq[0]: + _id = self.index_to_docstore_id[id] + doc = self.docstore.search(_id) + else: + _id0 = self.index_to_docstore_id[id] + doc0 = self.docstore.search(_id0) + doc.page_content += " " + doc0.page_content + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]]) + doc.metadata["score"] = int(doc_score) + docs.append(doc) + return docs + + +class LocalDocQA: + llm: object = None + embeddings: object = None + top_k: int = VECTOR_SEARCH_TOP_K + chunk_size: int = CHUNK_SIZE + chunk_conent: bool = True + score_threshold: int = VECTOR_SEARCH_SCORE_THRESHOLD + + def init_cfg(self, + top_k=VECTOR_SEARCH_TOP_K, + ): + + self.llm = None + self.top_k = top_k + + def init_knowledge_vector_store(self, + filepath, + vs_path: str or os.PathLike = None, + sentence_size=SENTENCE_SIZE, + text2vec=None): + loaded_files = [] + failed_files = [] + if isinstance(filepath, str): + if not os.path.exists(filepath): + print("路径不存在") + return None + elif os.path.isfile(filepath): + file = os.path.split(filepath)[-1] + try: + docs = load_file(filepath, SENTENCE_SIZE) + print(f"{file} 已成功加载") + loaded_files.append(filepath) + except Exception as e: + print(e) + print(f"{file} 未能成功加载") + return None + elif os.path.isdir(filepath): + docs = [] + for file in tqdm(os.listdir(filepath), desc="加载文件"): + fullfilepath = os.path.join(filepath, file) + try: + docs += load_file(fullfilepath, SENTENCE_SIZE) + loaded_files.append(fullfilepath) + except Exception as e: + print(e) + failed_files.append(file) + + if len(failed_files) > 0: + print("以下文件未能成功加载:") + for file in failed_files: + print(f"{file}\n") + + else: + docs = [] + for file in filepath: + docs += load_file(file, SENTENCE_SIZE) + print(f"{file} 已成功加载") + loaded_files.append(file) + + if len(docs) > 0: + print("文件加载完毕,正在生成向量库") + if vs_path and os.path.isdir(vs_path): + try: + self.vector_store = FAISS.load_local(vs_path, text2vec) + self.vector_store.add_documents(docs) + except: + self.vector_store = FAISS.from_documents(docs, text2vec) + else: + self.vector_store = FAISS.from_documents(docs, text2vec) # docs 为Document列表 + + self.vector_store.save_local(vs_path) + return vs_path, loaded_files + else: + raise RuntimeError("文件加载失败,请检查文件格式是否正确") + + def get_loaded_file(self, vs_path): + ds = self.vector_store.docstore + return set([ds._dict[k].metadata['source'].split(vs_path)[-1] for k in ds._dict]) + + + # query 查询内容 + # vs_path 知识库路径 + # chunk_conent 是否启用上下文关联 + # score_threshold 搜索匹配score阈值 + # vector_search_top_k 搜索知识库内容条数,默认搜索5条结果 + # chunk_sizes 匹配单段内容的连接上下文长度 + def get_knowledge_based_conent_test(self, query, vs_path, chunk_conent, + score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD, + vector_search_top_k=VECTOR_SEARCH_TOP_K, chunk_size=CHUNK_SIZE, + text2vec=None): + self.vector_store = FAISS.load_local(vs_path, text2vec) + self.vector_store.chunk_conent = chunk_conent + self.vector_store.score_threshold = score_threshold + self.vector_store.chunk_size = chunk_size + + embedding = self.vector_store.embedding_function.embed_query(query) + related_docs_with_score = similarity_search_with_score_by_vector(self.vector_store, embedding, k=vector_search_top_k) + + if not related_docs_with_score: + response = {"query": query, + "source_documents": []} + return response, "" + # prompt = f"{query}. You should answer this question using information from following documents: \n\n" + prompt = f"{query}. 你必须利用以下文档中包含的信息回答这个问题: \n\n---\n\n" + prompt += "\n\n".join([f"({k}): " + doc.page_content for k, doc in enumerate(related_docs_with_score)]) + prompt += "\n\n---\n\n" + prompt = prompt.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars + # print(prompt) + response = {"query": query, "source_documents": related_docs_with_score} + return response, prompt + + + + +def construct_vector_store(vs_id, vs_path, files, sentence_size, history, one_conent, one_content_segmentation, text2vec): + for file in files: + assert os.path.exists(file), "输入文件不存在:" + file + import nltk + if NLTK_DATA_PATH not in nltk.data.path: nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path + local_doc_qa = LocalDocQA() + local_doc_qa.init_cfg() + filelist = [] + if not os.path.exists(os.path.join(vs_path, vs_id)): + os.makedirs(os.path.join(vs_path, vs_id)) + for file in files: + file_name = file.name if not isinstance(file, str) else file + filename = os.path.split(file_name)[-1] + shutil.copyfile(file_name, os.path.join(vs_path, vs_id, filename)) + filelist.append(os.path.join(vs_path, vs_id, filename)) + vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(filelist, os.path.join(vs_path, vs_id), sentence_size, text2vec) + + if len(loaded_files): + file_status = f"已添加 {'、'.join([os.path.split(i)[-1] for i in loaded_files if i])} 内容至知识库,并已加载知识库,请开始提问" + else: + pass + # file_status = "文件未成功加载,请重新上传文件" + # print(file_status) + return local_doc_qa, vs_path + +@Singleton +class knowledge_archive_interface(): + def __init__(self) -> None: + self.threadLock = threading.Lock() + self.current_id = "" + self.kai_path = None + self.qa_handle = None + self.text2vec_large_chinese = None + + def get_chinese_text2vec(self): + if self.text2vec_large_chinese is None: + # < -------------------预热文本向量化模组--------------- > + from toolbox import ProxyNetworkActivate + print('Checking Text2vec ...') + from langchain.embeddings.huggingface import HuggingFaceEmbeddings + with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 + self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese") + + return self.text2vec_large_chinese + + + def feed_archive(self, file_manifest, vs_path, id="default"): + self.threadLock.acquire() + # import uuid + self.current_id = id + self.qa_handle, self.kai_path = construct_vector_store( + vs_id=self.current_id, + vs_path=vs_path, + files=file_manifest, + sentence_size=100, + history=[], + one_conent="", + one_content_segmentation="", + text2vec = self.get_chinese_text2vec(), + ) + self.threadLock.release() + + def get_current_archive_id(self): + return self.current_id + + def get_loaded_file(self, vs_path): + return self.qa_handle.get_loaded_file(vs_path) + + def answer_with_archive_by_id(self, txt, id, vs_path): + self.threadLock.acquire() + if not self.current_id == id: + self.current_id = id + self.qa_handle, self.kai_path = construct_vector_store( + vs_id=self.current_id, + vs_path=vs_path, + files=[], + sentence_size=100, + history=[], + one_conent="", + one_content_segmentation="", + text2vec = self.get_chinese_text2vec(), + ) + VECTOR_SEARCH_SCORE_THRESHOLD = 0 + VECTOR_SEARCH_TOP_K = 4 + CHUNK_SIZE = 512 + resp, prompt = self.qa_handle.get_knowledge_based_conent_test( + query = txt, + vs_path = self.kai_path, + score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD, + vector_search_top_k=VECTOR_SEARCH_TOP_K, + chunk_conent=True, + chunk_size=CHUNK_SIZE, + text2vec = self.get_chinese_text2vec(), + ) + self.threadLock.release() + return resp, prompt \ No newline at end of file diff --git a/crazy_functions/Langchain知识库.py b/crazy_functions/知识库问答.py similarity index 69% rename from crazy_functions/Langchain知识库.py rename to crazy_functions/知识库问答.py index 8433895..e1cd00c 100644 --- a/crazy_functions/Langchain知识库.py +++ b/crazy_functions/知识库问答.py @@ -1,10 +1,19 @@ -from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg +from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg, get_log_folder, get_user from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything +install_msg =""" +1. python -m pip install torch --index-url https://download.pytorch.org/whl/cpu + +2. python -m pip install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade + +3. python -m pip install unstructured[all-docs] --upgrade + +4. python -c 'import nltk; nltk.download("punkt")' +""" @CatchException -def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): +def 知识库文件注入(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): """ txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 llm_kwargs gpt模型参数, 如温度和top_p等, 一般原样传递下去就行 @@ -25,15 +34,15 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro # resolve deps try: - from zh_langchain import construct_vector_store - from langchain.embeddings.huggingface import HuggingFaceEmbeddings - from .crazy_utils import knowledge_archive_interface + # from zh_langchain import construct_vector_store + # from langchain.embeddings.huggingface import HuggingFaceEmbeddings + from crazy_functions.vector_fns.vector_database import knowledge_archive_interface except Exception as e: - chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) + chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - from .crazy_utils import try_install_deps - try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) - yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) + # from .crazy_utils import try_install_deps + # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) + # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) return # < --------------------读取文件--------------- > @@ -42,7 +51,7 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro for sp in spl: _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}') file_manifest += file_manifest_tmp - + if len(file_manifest) == 0: chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 @@ -62,13 +71,14 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro print('Establishing knowledge archive ...') with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 kai = knowledge_archive_interface() - kai.feed_archive(file_manifest=file_manifest, id=kai_id) - kai_files = kai.get_loaded_file() + vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store') + kai.feed_archive(file_manifest=file_manifest, vs_path=vs_path, id=kai_id) + kai_files = kai.get_loaded_file(vs_path=vs_path) kai_files = '
'.join(kai_files) # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"]) # yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id() - # chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答' + # chatbot._cookies['lock_plugin'] = 'crazy_functions.知识库文件注入->读取知识库作答' # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"]) chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 @@ -77,15 +87,15 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1): # resolve deps try: - from zh_langchain import construct_vector_store - from langchain.embeddings.huggingface import HuggingFaceEmbeddings - from .crazy_utils import knowledge_archive_interface + # from zh_langchain import construct_vector_store + # from langchain.embeddings.huggingface import HuggingFaceEmbeddings + from crazy_functions.vector_fns.vector_database import knowledge_archive_interface except Exception as e: - chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) + chatbot.append(["依赖不足", f"{str(e)}\n\n导入依赖失败。请用以下命令安装" + install_msg]) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - from .crazy_utils import try_install_deps - try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) - yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) + # from .crazy_utils import try_install_deps + # try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) + # yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) return # < ------------------- --------------- > @@ -93,7 +103,8 @@ def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, syst if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") kai_id = plugin_kwargs.get("advanced_arg", 'default') - resp, prompt = kai.answer_with_archive_by_id(txt, kai_id) + vs_path = get_log_folder(user=get_user(chatbot), plugin_name='vec_store') + resp, prompt = kai.answer_with_archive_by_id(txt, kai_id, vs_path) chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt)) yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 diff --git a/docs/GithubAction+NoLocal+Vectordb b/docs/GithubAction+NoLocal+Vectordb new file mode 100644 index 0000000..98595e3 --- /dev/null +++ b/docs/GithubAction+NoLocal+Vectordb @@ -0,0 +1,26 @@ +# 此Dockerfile适用于“无本地模型”的环境构建,如果需要使用chatglm等本地模型,请参考 docs/Dockerfile+ChatGLM +# 如何构建: 先修改 `config.py`, 然后 docker build -t gpt-academic-nolocal-vs -f docs/GithubAction+NoLocal+Vectordb . +# 如何运行: docker run --rm -it --net=host gpt-academic-nolocal-vs +FROM python:3.11 + +# 指定路径 +WORKDIR /gpt + +# 装载项目文件 +COPY . . + +# 安装依赖 +RUN pip3 install -r requirements.txt + +# 安装知识库插件的额外依赖 +RUN apt-get update && apt-get install libgl1 -y +RUN pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cpu +RUN pip3 install transformers protobuf langchain sentence-transformers faiss-cpu nltk beautifulsoup4 bitsandbytes tabulate icetk --upgrade +RUN pip3 install unstructured[all-docs] --upgrade + +# 可选步骤,用于预热模块 +RUN python3 -c 'from check_proxy import warm_up_modules; warm_up_modules()' +RUN python3 -c 'from check_proxy import warm_up_vectordb; warm_up_vectordb()' + +# 启动 +CMD ["python3", "-u", "main.py"] diff --git a/tests/test_plugins.py b/tests/test_plugins.py index 8470895..13ec259 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -48,11 +48,11 @@ if __name__ == "__main__": # for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]: # plugin_test(plugin='crazy_functions.批量Markdown翻译->Markdown翻译指定语言', main_input="README.md", advanced_arg={"advanced_arg": lang}) - # plugin_test(plugin='crazy_functions.Langchain知识库->知识库问答', main_input="./") + # plugin_test(plugin='crazy_functions.知识库文件注入->知识库文件注入', main_input="./") - # plugin_test(plugin='crazy_functions.Langchain知识库->读取知识库作答', main_input="What is the installation method?") + # plugin_test(plugin='crazy_functions.知识库文件注入->读取知识库作答', main_input="What is the installation method?") - # plugin_test(plugin='crazy_functions.Langchain知识库->读取知识库作答', main_input="远程云服务器部署?") + # plugin_test(plugin='crazy_functions.知识库文件注入->读取知识库作答', main_input="远程云服务器部署?") # plugin_test(plugin='crazy_functions.Latex输出PDF结果->Latex翻译中文并重新编译PDF', main_input="2210.03629") diff --git a/tests/test_utils.py b/tests/test_utils.py index 1fdca1e..c87908f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -56,11 +56,11 @@ vt.get_plugin_handle = silence_stdout_fn(get_plugin_handle) vt.get_plugin_default_kwargs = silence_stdout_fn(get_plugin_default_kwargs) vt.get_chat_handle = silence_stdout_fn(get_chat_handle) vt.get_chat_default_kwargs = silence_stdout_fn(get_chat_default_kwargs) -vt.chat_to_markdown_str = chat_to_markdown_str +vt.chat_to_markdown_str = (chat_to_markdown_str) proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \ vt.get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY') -def plugin_test(main_input, plugin, advanced_arg=None): +def plugin_test(main_input, plugin, advanced_arg=None, debug=True): from rich.live import Live from rich.markdown import Markdown @@ -72,7 +72,10 @@ def plugin_test(main_input, plugin, advanced_arg=None): plugin_kwargs['main_input'] = main_input if advanced_arg is not None: plugin_kwargs['plugin_kwargs'] = advanced_arg - my_working_plugin = silence_stdout(plugin)(**plugin_kwargs) + if debug: + my_working_plugin = (plugin)(**plugin_kwargs) + else: + my_working_plugin = silence_stdout(plugin)(**plugin_kwargs) with Live(Markdown(""), auto_refresh=False, vertical_overflow="visible") as live: for cookies, chat, hist, msg in my_working_plugin: diff --git a/tests/test_vector_plugins.py b/tests/test_vector_plugins.py new file mode 100644 index 0000000..9b75463 --- /dev/null +++ b/tests/test_vector_plugins.py @@ -0,0 +1,17 @@ +""" +对项目中的各个插件进行测试。运行方法:直接运行 python tests/test_plugins.py +""" + + +import os, sys +def validate_path(): dir_name = os.path.dirname(__file__); root_dir_assume = os.path.abspath(dir_name + '/..'); os.chdir(root_dir_assume); sys.path.append(root_dir_assume) +validate_path() # 返回项目根路径 + +if __name__ == "__main__": + from tests.test_utils import plugin_test + + plugin_test(plugin='crazy_functions.知识库问答->知识库文件注入', main_input="./README.md") + + plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="What is the installation method?") + + plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="远程云服务器部署?") \ No newline at end of file