new vector store establishment
This commit is contained in:
parent
647d9f88db
commit
ec60a85cac
@ -440,7 +440,7 @@ def get_crazy_functions():
|
||||
print('Load function plugin failed')
|
||||
|
||||
try:
|
||||
from crazy_functions.Langchain知识库 import 知识库问答
|
||||
from crazy_functions.知识库问答 import 知识库问答
|
||||
function_plugins.update({
|
||||
"构建知识库(先上传文件素材,再运行此插件)": {
|
||||
"Group": "对话",
|
||||
@ -456,7 +456,7 @@ def get_crazy_functions():
|
||||
print('Load function plugin failed')
|
||||
|
||||
try:
|
||||
from crazy_functions.Langchain知识库 import 读取知识库作答
|
||||
from crazy_functions.知识库问答 import 读取知识库作答
|
||||
function_plugins.update({
|
||||
"知识库问答(构建知识库后,再运行此插件)": {
|
||||
"Group": "对话",
|
||||
|
@ -1,4 +1,4 @@
|
||||
from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token
|
||||
from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton
|
||||
import threading
|
||||
import os
|
||||
import logging
|
||||
@ -632,89 +632,6 @@ def get_files_from_everything(txt, type): # type='.md'
|
||||
|
||||
|
||||
|
||||
def Singleton(cls):
|
||||
_instance = {}
|
||||
|
||||
def _singleton(*args, **kargs):
|
||||
if cls not in _instance:
|
||||
_instance[cls] = cls(*args, **kargs)
|
||||
return _instance[cls]
|
||||
|
||||
return _singleton
|
||||
|
||||
|
||||
@Singleton
|
||||
class knowledge_archive_interface():
|
||||
def __init__(self) -> None:
|
||||
self.threadLock = threading.Lock()
|
||||
self.current_id = ""
|
||||
self.kai_path = None
|
||||
self.qa_handle = None
|
||||
self.text2vec_large_chinese = None
|
||||
|
||||
def get_chinese_text2vec(self):
|
||||
if self.text2vec_large_chinese is None:
|
||||
# < -------------------预热文本向量化模组--------------- >
|
||||
from toolbox import ProxyNetworkActivate
|
||||
print('Checking Text2vec ...')
|
||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
||||
with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
|
||||
self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
|
||||
|
||||
return self.text2vec_large_chinese
|
||||
|
||||
|
||||
def feed_archive(self, file_manifest, id="default"):
|
||||
self.threadLock.acquire()
|
||||
# import uuid
|
||||
self.current_id = id
|
||||
from zh_langchain import construct_vector_store
|
||||
self.qa_handle, self.kai_path = construct_vector_store(
|
||||
vs_id=self.current_id,
|
||||
files=file_manifest,
|
||||
sentence_size=100,
|
||||
history=[],
|
||||
one_conent="",
|
||||
one_content_segmentation="",
|
||||
text2vec = self.get_chinese_text2vec(),
|
||||
)
|
||||
self.threadLock.release()
|
||||
|
||||
def get_current_archive_id(self):
|
||||
return self.current_id
|
||||
|
||||
def get_loaded_file(self):
|
||||
return self.qa_handle.get_loaded_file()
|
||||
|
||||
def answer_with_archive_by_id(self, txt, id):
|
||||
self.threadLock.acquire()
|
||||
if not self.current_id == id:
|
||||
self.current_id = id
|
||||
from zh_langchain import construct_vector_store
|
||||
self.qa_handle, self.kai_path = construct_vector_store(
|
||||
vs_id=self.current_id,
|
||||
files=[],
|
||||
sentence_size=100,
|
||||
history=[],
|
||||
one_conent="",
|
||||
one_content_segmentation="",
|
||||
text2vec = self.get_chinese_text2vec(),
|
||||
)
|
||||
VECTOR_SEARCH_SCORE_THRESHOLD = 0
|
||||
VECTOR_SEARCH_TOP_K = 4
|
||||
CHUNK_SIZE = 512
|
||||
resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
|
||||
query = txt,
|
||||
vs_path = self.kai_path,
|
||||
score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
|
||||
vector_search_top_k=VECTOR_SEARCH_TOP_K,
|
||||
chunk_conent=True,
|
||||
chunk_size=CHUNK_SIZE,
|
||||
text2vec = self.get_chinese_text2vec(),
|
||||
)
|
||||
self.threadLock.release()
|
||||
return resp, prompt
|
||||
|
||||
@Singleton
|
||||
class nougat_interface():
|
||||
def __init__(self):
|
||||
|
70
crazy_functions/vector_fns/general_file_loader.py
Normal file
70
crazy_functions/vector_fns/general_file_loader.py
Normal file
@ -0,0 +1,70 @@
|
||||
# From project chatglm-langchain
|
||||
|
||||
|
||||
from langchain.document_loaders import UnstructuredFileLoader
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
class ChineseTextSplitter(CharacterTextSplitter):
|
||||
def __init__(self, pdf: bool = False, sentence_size: int = None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.pdf = pdf
|
||||
self.sentence_size = sentence_size
|
||||
|
||||
def split_text1(self, text: str) -> List[str]:
|
||||
if self.pdf:
|
||||
text = re.sub(r"\n{3,}", "\n", text)
|
||||
text = re.sub('\s', ' ', text)
|
||||
text = text.replace("\n\n", "")
|
||||
sent_sep_pattern = re.compile('([﹒﹔﹖﹗.。!?]["’”」』]{0,2}|(?=["‘“「『]{1,2}|$))') # del :;
|
||||
sent_list = []
|
||||
for ele in sent_sep_pattern.split(text):
|
||||
if sent_sep_pattern.match(ele) and sent_list:
|
||||
sent_list[-1] += ele
|
||||
elif ele:
|
||||
sent_list.append(ele)
|
||||
return sent_list
|
||||
|
||||
def split_text(self, text: str) -> List[str]: ##此处需要进一步优化逻辑
|
||||
if self.pdf:
|
||||
text = re.sub(r"\n{3,}", r"\n", text)
|
||||
text = re.sub('\s', " ", text)
|
||||
text = re.sub("\n\n", "", text)
|
||||
|
||||
text = re.sub(r'([;;.!?。!?\?])([^”’])', r"\1\n\2", text) # 单字符断句符
|
||||
text = re.sub(r'(\.{6})([^"’”」』])', r"\1\n\2", text) # 英文省略号
|
||||
text = re.sub(r'(\…{2})([^"’”」』])', r"\1\n\2", text) # 中文省略号
|
||||
text = re.sub(r'([;;!?。!?\?]["’”」』]{0,2})([^;;!?,。!?\?])', r'\1\n\2', text)
|
||||
# 如果双引号前有终止符,那么双引号才是句子的终点,把分句符\n放到双引号后,注意前面的几句都小心保留了双引号
|
||||
text = text.rstrip() # 段尾如果有多余的\n就去掉它
|
||||
# 很多规则中会考虑分号;,但是这里我把它忽略不计,破折号、英文双引号等同样忽略,需要的再做些简单调整即可。
|
||||
ls = [i for i in text.split("\n") if i]
|
||||
for ele in ls:
|
||||
if len(ele) > self.sentence_size:
|
||||
ele1 = re.sub(r'([,,.]["’”」』]{0,2})([^,,.])', r'\1\n\2', ele)
|
||||
ele1_ls = ele1.split("\n")
|
||||
for ele_ele1 in ele1_ls:
|
||||
if len(ele_ele1) > self.sentence_size:
|
||||
ele_ele2 = re.sub(r'([\n]{1,}| {2,}["’”」』]{0,2})([^\s])', r'\1\n\2', ele_ele1)
|
||||
ele2_ls = ele_ele2.split("\n")
|
||||
for ele_ele2 in ele2_ls:
|
||||
if len(ele_ele2) > self.sentence_size:
|
||||
ele_ele3 = re.sub('( ["’”」』]{0,2})([^ ])', r'\1\n\2', ele_ele2)
|
||||
ele2_id = ele2_ls.index(ele_ele2)
|
||||
ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split("\n") if i] + ele2_ls[
|
||||
ele2_id + 1:]
|
||||
ele_id = ele1_ls.index(ele_ele1)
|
||||
ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]
|
||||
|
||||
id = ls.index(ele)
|
||||
ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]
|
||||
return ls
|
||||
|
||||
def load_file(filepath, sentence_size):
|
||||
loader = UnstructuredFileLoader(filepath, mode="elements")
|
||||
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
|
||||
docs = loader.load_and_split(text_splitter=textsplitter)
|
||||
# write_check_file(filepath, docs)
|
||||
return docs
|
||||
|
349
crazy_functions/vector_fns/vector_database.py
Normal file
349
crazy_functions/vector_fns/vector_database.py
Normal file
@ -0,0 +1,349 @@
|
||||
# From project chatglm-langchain
|
||||
|
||||
import threading
|
||||
from toolbox import Singleton
|
||||
import os
|
||||
import shutil
|
||||
import os
|
||||
import uuid
|
||||
import tqdm
|
||||
from langchain.vectorstores import FAISS
|
||||
from langchain.docstore.document import Document
|
||||
from typing import List, Tuple
|
||||
import numpy as np
|
||||
from crazy_functions.vector_fns.general_file_loader import load_file
|
||||
|
||||
embedding_model_dict = {
|
||||
"ernie-tiny": "nghuyong/ernie-3.0-nano-zh",
|
||||
"ernie-base": "nghuyong/ernie-3.0-base-zh",
|
||||
"text2vec-base": "shibing624/text2vec-base-chinese",
|
||||
"text2vec": "GanymedeNil/text2vec-large-chinese",
|
||||
}
|
||||
|
||||
# Embedding model name
|
||||
EMBEDDING_MODEL = "text2vec"
|
||||
|
||||
# Embedding running device
|
||||
EMBEDDING_DEVICE = "cpu"
|
||||
|
||||
VS_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
|
||||
|
||||
UPLOAD_ROOT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "content")
|
||||
|
||||
# 基于上下文的prompt模版,请务必保留"{question}"和"{context}"
|
||||
PROMPT_TEMPLATE = """已知信息:
|
||||
{context}
|
||||
|
||||
根据上述已知信息,简洁和专业的来回答用户的问题。如果无法从中得到答案,请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”,不允许在答案中添加编造成分,答案请使用中文。 问题是:{question}"""
|
||||
|
||||
# 文本分句长度
|
||||
SENTENCE_SIZE = 100
|
||||
|
||||
# 匹配后单段上下文长度
|
||||
CHUNK_SIZE = 250
|
||||
|
||||
# LLM input history length
|
||||
LLM_HISTORY_LEN = 3
|
||||
|
||||
# return top-k text chunk from vector store
|
||||
VECTOR_SEARCH_TOP_K = 5
|
||||
|
||||
# 知识检索内容相关度 Score, 数值范围约为0-1100,如果为0,则不生效,经测试设置为小于500时,匹配结果更精准
|
||||
VECTOR_SEARCH_SCORE_THRESHOLD = 0
|
||||
|
||||
NLTK_DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "nltk_data")
|
||||
|
||||
FLAG_USER_NAME = uuid.uuid4().hex
|
||||
|
||||
# 是否开启跨域,默认为False,如果需要开启,请设置为True
|
||||
# is open cross domain
|
||||
OPEN_CROSS_DOMAIN = False
|
||||
|
||||
def similarity_search_with_score_by_vector(
|
||||
self, embedding: List[float], k: int = 4
|
||||
) -> List[Tuple[Document, float]]:
|
||||
|
||||
def seperate_list(ls: List[int]) -> List[List[int]]:
|
||||
lists = []
|
||||
ls1 = [ls[0]]
|
||||
for i in range(1, len(ls)):
|
||||
if ls[i - 1] + 1 == ls[i]:
|
||||
ls1.append(ls[i])
|
||||
else:
|
||||
lists.append(ls1)
|
||||
ls1 = [ls[i]]
|
||||
lists.append(ls1)
|
||||
return lists
|
||||
|
||||
scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
|
||||
docs = []
|
||||
id_set = set()
|
||||
store_len = len(self.index_to_docstore_id)
|
||||
for j, i in enumerate(indices[0]):
|
||||
if i == -1 or 0 < self.score_threshold < scores[0][j]:
|
||||
# This happens when not enough docs are returned.
|
||||
continue
|
||||
_id = self.index_to_docstore_id[i]
|
||||
doc = self.docstore.search(_id)
|
||||
if not self.chunk_conent:
|
||||
if not isinstance(doc, Document):
|
||||
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
||||
doc.metadata["score"] = int(scores[0][j])
|
||||
docs.append(doc)
|
||||
continue
|
||||
id_set.add(i)
|
||||
docs_len = len(doc.page_content)
|
||||
for k in range(1, max(i, store_len - i)):
|
||||
break_flag = False
|
||||
for l in [i + k, i - k]:
|
||||
if 0 <= l < len(self.index_to_docstore_id):
|
||||
_id0 = self.index_to_docstore_id[l]
|
||||
doc0 = self.docstore.search(_id0)
|
||||
if docs_len + len(doc0.page_content) > self.chunk_size:
|
||||
break_flag = True
|
||||
break
|
||||
elif doc0.metadata["source"] == doc.metadata["source"]:
|
||||
docs_len += len(doc0.page_content)
|
||||
id_set.add(l)
|
||||
if break_flag:
|
||||
break
|
||||
if not self.chunk_conent:
|
||||
return docs
|
||||
if len(id_set) == 0 and self.score_threshold > 0:
|
||||
return []
|
||||
id_list = sorted(list(id_set))
|
||||
id_lists = seperate_list(id_list)
|
||||
for id_seq in id_lists:
|
||||
for id in id_seq:
|
||||
if id == id_seq[0]:
|
||||
_id = self.index_to_docstore_id[id]
|
||||
doc = self.docstore.search(_id)
|
||||
else:
|
||||
_id0 = self.index_to_docstore_id[id]
|
||||
doc0 = self.docstore.search(_id0)
|
||||
doc.page_content += " " + doc0.page_content
|
||||
if not isinstance(doc, Document):
|
||||
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
||||
doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]])
|
||||
doc.metadata["score"] = int(doc_score)
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
|
||||
class LocalDocQA:
|
||||
llm: object = None
|
||||
embeddings: object = None
|
||||
top_k: int = VECTOR_SEARCH_TOP_K
|
||||
chunk_size: int = CHUNK_SIZE
|
||||
chunk_conent: bool = True
|
||||
score_threshold: int = VECTOR_SEARCH_SCORE_THRESHOLD
|
||||
|
||||
def init_cfg(self,
|
||||
top_k=VECTOR_SEARCH_TOP_K,
|
||||
):
|
||||
|
||||
self.llm = None
|
||||
self.top_k = top_k
|
||||
|
||||
def init_knowledge_vector_store(self,
|
||||
filepath,
|
||||
vs_path: str or os.PathLike = None,
|
||||
sentence_size=SENTENCE_SIZE,
|
||||
text2vec=None):
|
||||
loaded_files = []
|
||||
failed_files = []
|
||||
if isinstance(filepath, str):
|
||||
if not os.path.exists(filepath):
|
||||
print("路径不存在")
|
||||
return None
|
||||
elif os.path.isfile(filepath):
|
||||
file = os.path.split(filepath)[-1]
|
||||
try:
|
||||
docs = load_file(filepath, sentence_size)
|
||||
print(f"{file} 已成功加载")
|
||||
loaded_files.append(filepath)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print(f"{file} 未能成功加载")
|
||||
return None
|
||||
elif os.path.isdir(filepath):
|
||||
docs = []
|
||||
for file in tqdm(os.listdir(filepath), desc="加载文件"):
|
||||
fullfilepath = os.path.join(filepath, file)
|
||||
try:
|
||||
docs += load_file(fullfilepath, sentence_size)
|
||||
loaded_files.append(fullfilepath)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
failed_files.append(file)
|
||||
|
||||
if len(failed_files) > 0:
|
||||
print("以下文件未能成功加载:")
|
||||
for file in failed_files:
|
||||
print(f"{file}\n")
|
||||
|
||||
else:
|
||||
docs = []
|
||||
for file in filepath:
|
||||
try:
|
||||
docs += load_file(file)
|
||||
print(f"{file} 已成功加载")
|
||||
loaded_files.append(file)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
print(f"{file} 未能成功加载")
|
||||
|
||||
if len(docs) > 0:
|
||||
print("文件加载完毕,正在生成向量库")
|
||||
if vs_path and os.path.isdir(vs_path):
|
||||
self.vector_store = FAISS.load_local(vs_path, text2vec)
|
||||
self.vector_store.add_documents(docs)
|
||||
else:
|
||||
if not vs_path: assert False
|
||||
self.vector_store = FAISS.from_documents(docs, text2vec) # docs 为Document列表
|
||||
|
||||
self.vector_store.save_local(vs_path)
|
||||
return vs_path, loaded_files
|
||||
else:
|
||||
self.vector_store = FAISS.load_local(vs_path, text2vec)
|
||||
return vs_path, loaded_files
|
||||
|
||||
def get_loaded_file(self):
|
||||
ds = self.vector_store.docstore
|
||||
return set([ds._dict[k].metadata['source'].split(UPLOAD_ROOT_PATH)[-1] for k in ds._dict])
|
||||
|
||||
|
||||
# query 查询内容
|
||||
# vs_path 知识库路径
|
||||
# chunk_conent 是否启用上下文关联
|
||||
# score_threshold 搜索匹配score阈值
|
||||
# vector_search_top_k 搜索知识库内容条数,默认搜索5条结果
|
||||
# chunk_sizes 匹配单段内容的连接上下文长度
|
||||
def get_knowledge_based_conent_test(self, query, vs_path, chunk_conent,
|
||||
score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
|
||||
vector_search_top_k=VECTOR_SEARCH_TOP_K, chunk_size=CHUNK_SIZE,
|
||||
text2vec=None):
|
||||
self.vector_store = FAISS.load_local(vs_path, text2vec)
|
||||
self.vector_store.chunk_conent = chunk_conent
|
||||
self.vector_store.score_threshold = score_threshold
|
||||
self.vector_store.chunk_size = chunk_size
|
||||
|
||||
embedding = self.vector_store.embedding_function(query)
|
||||
related_docs_with_score = similarity_search_with_score_by_vector(self.vector_store, embedding, k=vector_search_top_k)
|
||||
|
||||
if not related_docs_with_score:
|
||||
response = {"query": query,
|
||||
"source_documents": []}
|
||||
return response, ""
|
||||
# prompt = f"{query}. You should answer this question using information from following documents: \n\n"
|
||||
prompt = f"{query}. 你必须利用以下文档中包含的信息回答这个问题: \n\n---\n\n"
|
||||
prompt += "\n\n".join([f"({k}): " + doc.page_content for k, doc in enumerate(related_docs_with_score)])
|
||||
prompt += "\n\n---\n\n"
|
||||
prompt = prompt.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
|
||||
# print(prompt)
|
||||
response = {"query": query, "source_documents": related_docs_with_score}
|
||||
return response, prompt
|
||||
|
||||
|
||||
|
||||
|
||||
def construct_vector_store(vs_id, files, sentence_size, history, one_conent, one_content_segmentation, text2vec):
|
||||
for file in files:
|
||||
assert os.path.exists(file), "输入文件不存在"
|
||||
import nltk
|
||||
if NLTK_DATA_PATH not in nltk.data.path: nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
|
||||
local_doc_qa = LocalDocQA()
|
||||
local_doc_qa.init_cfg()
|
||||
vs_path = os.path.join(VS_ROOT_PATH, vs_id)
|
||||
filelist = []
|
||||
if not os.path.exists(os.path.join(UPLOAD_ROOT_PATH, vs_id)):
|
||||
os.makedirs(os.path.join(UPLOAD_ROOT_PATH, vs_id))
|
||||
if isinstance(files, list):
|
||||
for file in files:
|
||||
file_name = file.name if not isinstance(file, str) else file
|
||||
filename = os.path.split(file_name)[-1]
|
||||
shutil.copyfile(file_name, os.path.join(UPLOAD_ROOT_PATH, vs_id, filename))
|
||||
filelist.append(os.path.join(UPLOAD_ROOT_PATH, vs_id, filename))
|
||||
vs_path, loaded_files = local_doc_qa.init_knowledge_vector_store(filelist, vs_path, sentence_size, text2vec)
|
||||
else:
|
||||
vs_path, loaded_files = local_doc_qa.one_knowledge_add(vs_path, files, one_conent, one_content_segmentation,
|
||||
sentence_size, text2vec)
|
||||
if len(loaded_files):
|
||||
file_status = f"已添加 {'、'.join([os.path.split(i)[-1] for i in loaded_files if i])} 内容至知识库,并已加载知识库,请开始提问"
|
||||
else:
|
||||
pass
|
||||
# file_status = "文件未成功加载,请重新上传文件"
|
||||
# print(file_status)
|
||||
return local_doc_qa, vs_path
|
||||
|
||||
@Singleton
|
||||
class knowledge_archive_interface():
|
||||
def __init__(self) -> None:
|
||||
self.threadLock = threading.Lock()
|
||||
self.current_id = ""
|
||||
self.kai_path = None
|
||||
self.qa_handle = None
|
||||
self.text2vec_large_chinese = None
|
||||
|
||||
def get_chinese_text2vec(self):
|
||||
if self.text2vec_large_chinese is None:
|
||||
# < -------------------预热文本向量化模组--------------- >
|
||||
from toolbox import ProxyNetworkActivate
|
||||
print('Checking Text2vec ...')
|
||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
||||
with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
|
||||
self.text2vec_large_chinese = HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
|
||||
|
||||
return self.text2vec_large_chinese
|
||||
|
||||
|
||||
def feed_archive(self, file_manifest, id="default"):
|
||||
self.threadLock.acquire()
|
||||
# import uuid
|
||||
self.current_id = id
|
||||
from zh_langchain import construct_vector_store
|
||||
self.qa_handle, self.kai_path = construct_vector_store(
|
||||
vs_id=self.current_id,
|
||||
files=file_manifest,
|
||||
sentence_size=100,
|
||||
history=[],
|
||||
one_conent="",
|
||||
one_content_segmentation="",
|
||||
text2vec = self.get_chinese_text2vec(),
|
||||
)
|
||||
self.threadLock.release()
|
||||
|
||||
def get_current_archive_id(self):
|
||||
return self.current_id
|
||||
|
||||
def get_loaded_file(self):
|
||||
return self.qa_handle.get_loaded_file()
|
||||
|
||||
def answer_with_archive_by_id(self, txt, id):
|
||||
self.threadLock.acquire()
|
||||
if not self.current_id == id:
|
||||
self.current_id = id
|
||||
from zh_langchain import construct_vector_store
|
||||
self.qa_handle, self.kai_path = construct_vector_store(
|
||||
vs_id=self.current_id,
|
||||
files=[],
|
||||
sentence_size=100,
|
||||
history=[],
|
||||
one_conent="",
|
||||
one_content_segmentation="",
|
||||
text2vec = self.get_chinese_text2vec(),
|
||||
)
|
||||
VECTOR_SEARCH_SCORE_THRESHOLD = 0
|
||||
VECTOR_SEARCH_TOP_K = 4
|
||||
CHUNK_SIZE = 512
|
||||
resp, prompt = self.qa_handle.get_knowledge_based_conent_test(
|
||||
query = txt,
|
||||
vs_path = self.kai_path,
|
||||
score_threshold=VECTOR_SEARCH_SCORE_THRESHOLD,
|
||||
vector_search_top_k=VECTOR_SEARCH_TOP_K,
|
||||
chunk_conent=True,
|
||||
chunk_size=CHUNK_SIZE,
|
||||
text2vec = self.get_chinese_text2vec(),
|
||||
)
|
||||
self.threadLock.release()
|
||||
return resp, prompt
|
@ -68,7 +68,7 @@ def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_pro
|
||||
# chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"])
|
||||
# yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
|
||||
# chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id()
|
||||
# chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答'
|
||||
# chatbot._cookies['lock_plugin'] = 'crazy_functions.知识库问答->读取知识库作答'
|
||||
# chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"])
|
||||
chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"])
|
||||
yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
|
@ -1666,7 +1666,7 @@
|
||||
"连接bing搜索回答问题": "ConnectBingSearchAnswerQuestion",
|
||||
"联网的ChatGPT_bing版": "OnlineChatGPT_BingEdition",
|
||||
"Markdown翻译指定语言": "TranslateMarkdownToSpecifiedLanguage",
|
||||
"Langchain知识库": "LangchainKnowledgeBase",
|
||||
"知识库问答": "LangchainKnowledgeBase",
|
||||
"Latex英文纠错加PDF对比": "CorrectEnglishInLatexWithPDFComparison",
|
||||
"Latex输出PDF结果": "OutputPDFFromLatex",
|
||||
"Latex翻译中文并重新编译PDF": "TranslateChineseToEnglishInLatexAndRecompilePDF",
|
||||
|
@ -1487,7 +1487,7 @@
|
||||
"数学动画生成manim": "GenerateMathematicalAnimationManim",
|
||||
"Markdown翻译指定语言": "TranslateMarkdownSpecifiedLanguage",
|
||||
"知识库问答": "KnowledgeBaseQuestionAnswer",
|
||||
"Langchain知识库": "LangchainKnowledgeBase",
|
||||
"知识库问答": "LangchainKnowledgeBase",
|
||||
"读取知识库作答": "ReadKnowledgeBaseAnswer",
|
||||
"交互功能模板函数": "InteractiveFunctionTemplateFunction",
|
||||
"交互功能函数模板": "InteractiveFunctionFunctionTemplate",
|
||||
|
@ -15,7 +15,7 @@
|
||||
"删除所有本地对话历史记录": "DeleteAllLocalConversationHistoryRecords",
|
||||
"批量Markdown翻译": "BatchTranslateMarkdown",
|
||||
"连接bing搜索回答问题": "ConnectBingSearchAnswerQuestion",
|
||||
"Langchain知识库": "LangchainKnowledgeBase",
|
||||
"知识库问答": "LangchainKnowledgeBase",
|
||||
"Latex输出PDF结果": "OutputPDFFromLatex",
|
||||
"把字符太少的块清除为回车": "ClearBlocksWithTooFewCharactersToNewline",
|
||||
"Latex精细分解与转化": "DecomposeAndConvertLatex",
|
||||
|
@ -1463,7 +1463,7 @@
|
||||
"数学动画生成manim": "GenerateMathematicalAnimationsWithManim",
|
||||
"Markdown翻译指定语言": "TranslateMarkdownToSpecifiedLanguage",
|
||||
"知识库问答": "KnowledgeBaseQA",
|
||||
"Langchain知识库": "LangchainKnowledgeBase",
|
||||
"知识库问答": "LangchainKnowledgeBase",
|
||||
"读取知识库作答": "ReadKnowledgeBaseAndAnswerQuestions",
|
||||
"交互功能模板函数": "InteractiveFunctionTemplateFunctions",
|
||||
"交互功能函数模板": "InteractiveFunctionFunctionTemplates",
|
||||
|
@ -48,11 +48,11 @@ if __name__ == "__main__":
|
||||
# for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]:
|
||||
# plugin_test(plugin='crazy_functions.批量Markdown翻译->Markdown翻译指定语言', main_input="README.md", advanced_arg={"advanced_arg": lang})
|
||||
|
||||
# plugin_test(plugin='crazy_functions.Langchain知识库->知识库问答', main_input="./")
|
||||
# plugin_test(plugin='crazy_functions.知识库问答->知识库问答', main_input="./")
|
||||
|
||||
# plugin_test(plugin='crazy_functions.Langchain知识库->读取知识库作答', main_input="What is the installation method?")
|
||||
# plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="What is the installation method?")
|
||||
|
||||
# plugin_test(plugin='crazy_functions.Langchain知识库->读取知识库作答', main_input="远程云服务器部署?")
|
||||
# plugin_test(plugin='crazy_functions.知识库问答->读取知识库作答', main_input="远程云服务器部署?")
|
||||
|
||||
# plugin_test(plugin='crazy_functions.Latex输出PDF结果->Latex翻译中文并重新编译PDF', main_input="2210.03629")
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user