From 3acbf1ab171009c603c03db3fc25cc96fd9658ee Mon Sep 17 00:00:00 2001 From: Archer <545436317@qq.com> Date: Sat, 25 Nov 2023 21:58:00 +0800 Subject: [PATCH] 4.6.2-alpha (#517) --- packages/global/common/string/tools.ts | 3 +- packages/global/core/dataset/type.d.ts | 6 +- packages/global/core/module/node/type.d.ts | 2 +- packages/global/core/module/type.d.ts | 2 +- packages/service/core/dataset/data/schema.ts | 10 +- packages/service/package.json | 22 +-- pnpm-lock.yaml | 171 ++++++++++++++++ projects/app/package.json | 1 + projects/app/public/locales/en/common.json | 3 + projects/app/public/locales/zh/common.json | 3 + .../src/components/ChatBox/MessageInput.tsx | 2 +- .../src/components/ChatBox/ResponseTags.tsx | 2 +- .../app/src/global/core/api/datasetReq.d.ts | 7 - projects/app/src/global/core/dataset/api.d.ts | 17 +- projects/app/src/pages/_app.tsx | 2 +- projects/app/src/pages/api/admin/initv46.ts | 2 +- projects/app/src/pages/api/admin/initv462.ts | 46 +++-- projects/app/src/pages/api/core/app/create.ts | 4 +- .../core/app/form2Modules/fastgpt-simple.ts | 14 +- .../pages/api/core/dataset/data/insertData.ts | 11 +- .../pages/api/core/dataset/data/pushData.ts | 16 ++ .../src/pages/api/core/dataset/searchTest.ts | 12 +- .../app/src/pages/api/core/plugin/detail.ts | 5 +- .../pages/dataset/detail/components/Info.tsx | 7 +- .../pages/dataset/detail/components/Test.tsx | 17 +- .../dataset/list/component/CreateModal.tsx | 2 +- projects/app/src/pages/dataset/list/index.tsx | 12 +- .../app/src/pages/plugin/edit/Preview.tsx | 2 +- projects/app/src/pages/plugin/edit/index.tsx | 27 +-- .../pages/plugin/list/component/EditModal.tsx | 76 ++++++- projects/app/src/service/core/app/module.ts | 2 +- .../service/core/dataset/data/controller.ts | 6 +- .../app/src/service/core/dataset/data/pg.ts | 187 ++++++++++++------ .../app/src/service/core/dataset/utils.ts | 34 ++++ projects/app/src/service/events/generateQA.ts | 22 ++- .../app/src/service/events/generateVector.ts | 34 +++- projects/app/src/utils/sse.ts | 2 +- projects/app/src/web/core/dataset/api.ts | 6 +- .../src/web/core/dataset/store/searchTest.ts | 1 + 39 files changed, 617 insertions(+), 183 deletions(-) create mode 100644 projects/app/src/service/core/dataset/utils.ts diff --git a/packages/global/common/string/tools.ts b/packages/global/common/string/tools.ts index 5e4e7487c..485062b2d 100644 --- a/packages/global/common/string/tools.ts +++ b/packages/global/common/string/tools.ts @@ -13,7 +13,8 @@ export const hashStr = (str: string) => { }; /* simple text, remove chinese space and extra \n */ -export const simpleText = (text: string) => { +export const simpleText = (text = '') => { + text = text.trim(); text = text.replace(/([\u4e00-\u9fa5])[\s&&[^\n]]+([\u4e00-\u9fa5])/g, '$1$2'); text = text.replace(/\r\n|\r/g, '\n'); text = text.replace(/\n{3,}/g, '\n\n'); diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index 851e1e326..391be8cc5 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -59,6 +59,7 @@ export type DatasetDataSchemaType = { collectionId: string; q: string; // large chunks or question a: string; // answer or custom content + fullTextToken: string; indexes: DatasetDataIndexItemType[]; }; @@ -83,6 +84,9 @@ export type DatasetTrainingSchemaType = { export type CollectionWithDatasetType = Omit & { datasetId: DatasetSchemaType; }; +export type DatasetDataWithCollectionType = Omit & { + collectionId: DatasetCollectionSchemaType; +}; /* ================= dataset ===================== */ export type DatasetItemType = Omit & { @@ -130,6 +134,6 @@ export type DatasetFileSchema = { }; /* ============= search =============== */ -export type SearchDataResponseItemType = DatasetDataItemType & { +export type SearchDataResponseItemType = Omit & { score: number; }; diff --git a/packages/global/core/module/node/type.d.ts b/packages/global/core/module/node/type.d.ts index 8c0cc97b2..27effc322 100644 --- a/packages/global/core/module/node/type.d.ts +++ b/packages/global/core/module/node/type.d.ts @@ -20,7 +20,7 @@ export type FlowNodeChangeProps = { }; export type FlowNodeInputItemType = { - key: `${ModuleInputKeyEnum}`; + key: `${ModuleInputKeyEnum}` | string; type: `${FlowNodeInputTypeEnum}`; // Decide on a render style value?: any; valueType?: `${ModuleDataTypeEnum}`; // data type diff --git a/packages/global/core/module/type.d.ts b/packages/global/core/module/type.d.ts index 15eced56f..11b12eea0 100644 --- a/packages/global/core/module/type.d.ts +++ b/packages/global/core/module/type.d.ts @@ -25,7 +25,7 @@ export type moduleTemplateListType = { // store module type export type ModuleItemType = { name: string; - logo?: string; + avatar?: string; intro?: string; moduleId: string; position?: { diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index a7a51adcc..48826beb9 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -43,13 +43,9 @@ const DatasetDataSchema = new Schema({ type: String, default: '' }, - qToken: { + fullTextToken: { type: String, - default: '' - }, - aToken: { - type: String, - default: '' + required: true }, indexes: { type: [ @@ -82,7 +78,7 @@ try { DatasetDataSchema.index({ datasetId: 1 }); DatasetDataSchema.index({ collectionId: 1 }); // full text index - DatasetDataSchema.index({ qToken: 'text', aToken: 'text' }); + DatasetDataSchema.index({ fullTextToken: 'text' }); } catch (error) { console.log(error); } diff --git a/packages/service/package.json b/packages/service/package.json index db89876fc..09d622fa2 100644 --- a/packages/service/package.json +++ b/packages/service/package.json @@ -4,23 +4,23 @@ "dependencies": { "@fastgpt/global": "workspace:*", "axios": "^1.5.1", - "nextjs-cors": "^2.1.2", - "next": "13.5.2", "cookie": "^0.5.0", + "encoding": "^0.1.13", "jsonwebtoken": "^9.0.2", "mongoose": "^7.0.2", - "winston": "^3.10.0", - "winston-mongodb": "^5.1.1", - "tunnel": "^0.0.6", - "encoding": "^0.1.13", + "nanoid": "^4.0.1", + "next": "13.5.2", + "nextjs-cors": "^2.1.2", "pg": "^8.10.0", - "nanoid": "^4.0.1" + "tunnel": "^0.0.6", + "winston": "^3.10.0", + "winston-mongodb": "^5.1.1" }, "devDependencies": { - "@types/tunnel": "^0.0.4", - "@types/pg": "^8.6.6", - "@types/node": "^20.8.5", "@types/cookie": "^0.5.2", - "@types/jsonwebtoken": "^9.0.3" + "@types/jsonwebtoken": "^9.0.3", + "@types/node": "^20.8.5", + "@types/pg": "^8.6.6", + "@types/tunnel": "^0.0.4" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ec2c0db8f..71c250c23 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -164,6 +164,9 @@ importers: '@mozilla/readability': specifier: ^0.4.4 version: registry.npmmirror.com/@mozilla/readability@0.4.4 + '@node-rs/jieba': + specifier: ^1.7.2 + version: registry.npmmirror.com/@node-rs/jieba@1.7.2 '@tanstack/react-query': specifier: ^4.24.10 version: registry.npmmirror.com/@tanstack/react-query@4.36.1(react-dom@18.2.0)(react@18.2.0) @@ -3686,6 +3689,174 @@ packages: requiresBuild: true optional: true + registry.npmmirror.com/@node-rs/jieba-android-arm-eabi@1.7.2: + resolution: {integrity: sha512-FyDHRNSRIHOQO7S6Q4RwuGffnnnuNwaXPH7K8WqSzifEY+zFIaSPcNqrZHrnqyeXc4JiYpBIHeP+0Mkf1kIGRA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-android-arm-eabi/-/jieba-android-arm-eabi-1.7.2.tgz} + name: '@node-rs/jieba-android-arm-eabi' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm] + os: [android] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-android-arm64@1.7.2: + resolution: {integrity: sha512-z0UEZCGrAX/IiarhuDMsEIDZBS77UZv4SQyL/J48yrsbWKbb2lJ1vCrYxXIWqwp6auXHEu4r1O/pMriDAcEnPg==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-android-arm64/-/jieba-android-arm64-1.7.2.tgz} + name: '@node-rs/jieba-android-arm64' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm64] + os: [android] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-darwin-arm64@1.7.2: + resolution: {integrity: sha512-M2cHIWRaaOmXGKy446SH2+Y2PzREaI2oYznPbg55wYEdioUp01YS/2WRG8CaoCKEj0aUocA7MFM2vVcoIAsbQw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-darwin-arm64/-/jieba-darwin-arm64-1.7.2.tgz} + name: '@node-rs/jieba-darwin-arm64' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm64] + os: [darwin] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-darwin-x64@1.7.2: + resolution: {integrity: sha512-euDawBU2FxB0CGTR803BA6WABsiicIrqa61z2AFFDPkJCDrauEM0jbMg3GDKLAvbaLbZ1Etu3QNN5xyroqp4Qw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-darwin-x64/-/jieba-darwin-x64-1.7.2.tgz} + name: '@node-rs/jieba-darwin-x64' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [x64] + os: [darwin] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-freebsd-x64@1.7.2: + resolution: {integrity: sha512-vXCaYxPb90d/xTBVG+ZZXrFLXsO2719pZSyiZCL2tey+UY28U7MOoK6394Wwmf0FCB/eRTQMCKjVIUDi+IRMUg==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-freebsd-x64/-/jieba-freebsd-x64-1.7.2.tgz} + name: '@node-rs/jieba-freebsd-x64' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [x64] + os: [freebsd] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-linux-arm-gnueabihf@1.7.2: + resolution: {integrity: sha512-HTep79XlJYO3KRYZ2kJChG9HnYr1DKSQTB+HEYWKLK0ifphqybcxGNLAdH0S4dViG2ciD0+iN/refgtqZEidpw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-linux-arm-gnueabihf/-/jieba-linux-arm-gnueabihf-1.7.2.tgz} + name: '@node-rs/jieba-linux-arm-gnueabihf' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm] + os: [linux] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-linux-arm64-gnu@1.7.2: + resolution: {integrity: sha512-P8QJdQydOVewL1MIqYiRpI7LOfrRQag+p4/hwExe+YXH8C7DOrR8rWJD/7XNRTbpOimlHq1UN/e+ZzhxQF/cLw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-linux-arm64-gnu/-/jieba-linux-arm64-gnu-1.7.2.tgz} + name: '@node-rs/jieba-linux-arm64-gnu' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + libc: [glibc] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-linux-arm64-musl@1.7.2: + resolution: {integrity: sha512-WjnN0hmDvTXb2h3hMW5VnUGkK1xaqhs+WHfMMilau55+YN+YOYALKZ0TeBY4BapClLuBx54wqwmBX+B4hAXunQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-linux-arm64-musl/-/jieba-linux-arm64-musl-1.7.2.tgz} + name: '@node-rs/jieba-linux-arm64-musl' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm64] + os: [linux] + libc: [musl] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-linux-x64-gnu@1.7.2: + resolution: {integrity: sha512-gBXds/DwNSA6lNUxJjL6WIaNT6pnlM5juUgV/krLLkBJ8vXpOrQ07p0rrK1tnigz9b20xhsHaFRSwED1Y8zeXw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-linux-x64-gnu/-/jieba-linux-x64-gnu-1.7.2.tgz} + name: '@node-rs/jieba-linux-x64-gnu' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + libc: [glibc] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-linux-x64-musl@1.7.2: + resolution: {integrity: sha512-tNVD3SMuG5zAj7+bLS2Enio3zR7BPxi3PhQtpQ+Hv83jajIcN46QQ0EdoMFz/aB+hkQ9PlLAstu+VREFegs5EA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-linux-x64-musl/-/jieba-linux-x64-musl-1.7.2.tgz} + name: '@node-rs/jieba-linux-x64-musl' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [x64] + os: [linux] + libc: [musl] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-win32-arm64-msvc@1.7.2: + resolution: {integrity: sha512-/e1iQ0Dh02lGPNCYTU/H3cfIsWydaGRzZ3TDj6GfWrxkWqXORL98x/VJ/C/uKLpc7GSLLd9ygyZG7SOAfKe2tA==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-win32-arm64-msvc/-/jieba-win32-arm64-msvc-1.7.2.tgz} + name: '@node-rs/jieba-win32-arm64-msvc' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [arm64] + os: [win32] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-win32-ia32-msvc@1.7.2: + resolution: {integrity: sha512-cYjA6YUiOwtuEzWErvwMMt/RETNWQDLcmAaiHA8ohsa6c0eB0kRJlQCc683tlaczZxqroY/7C9mxgJNGvoGRbw==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-win32-ia32-msvc/-/jieba-win32-ia32-msvc-1.7.2.tgz} + name: '@node-rs/jieba-win32-ia32-msvc' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [ia32] + os: [win32] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba-win32-x64-msvc@1.7.2: + resolution: {integrity: sha512-2M+Um3woFF17sa8VBYQQ6E5PNMe9Kf9fdzmeDh/GzuNHXlxW4LyK9VTV8zchIv/bDNAR5Z85kfW4wASULUxvFQ==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba-win32-x64-msvc/-/jieba-win32-x64-msvc-1.7.2.tgz} + name: '@node-rs/jieba-win32-x64-msvc' + version: 1.7.2 + engines: {node: '>= 10'} + cpu: [x64] + os: [win32] + requiresBuild: true + dev: false + optional: true + + registry.npmmirror.com/@node-rs/jieba@1.7.2: + resolution: {integrity: sha512-zGto08NDU+KWm670qVHYGTb0YTEJ0A97dwH3WCnnhyRYMqTbOXKC6OwTc/cjzfSJP1UDBSar9Ug9BlmWmEThWg==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@node-rs/jieba/-/jieba-1.7.2.tgz} + name: '@node-rs/jieba' + version: 1.7.2 + engines: {node: '>= 10'} + optionalDependencies: + '@node-rs/jieba-android-arm-eabi': registry.npmmirror.com/@node-rs/jieba-android-arm-eabi@1.7.2 + '@node-rs/jieba-android-arm64': registry.npmmirror.com/@node-rs/jieba-android-arm64@1.7.2 + '@node-rs/jieba-darwin-arm64': registry.npmmirror.com/@node-rs/jieba-darwin-arm64@1.7.2 + '@node-rs/jieba-darwin-x64': registry.npmmirror.com/@node-rs/jieba-darwin-x64@1.7.2 + '@node-rs/jieba-freebsd-x64': registry.npmmirror.com/@node-rs/jieba-freebsd-x64@1.7.2 + '@node-rs/jieba-linux-arm-gnueabihf': registry.npmmirror.com/@node-rs/jieba-linux-arm-gnueabihf@1.7.2 + '@node-rs/jieba-linux-arm64-gnu': registry.npmmirror.com/@node-rs/jieba-linux-arm64-gnu@1.7.2 + '@node-rs/jieba-linux-arm64-musl': registry.npmmirror.com/@node-rs/jieba-linux-arm64-musl@1.7.2 + '@node-rs/jieba-linux-x64-gnu': registry.npmmirror.com/@node-rs/jieba-linux-x64-gnu@1.7.2 + '@node-rs/jieba-linux-x64-musl': registry.npmmirror.com/@node-rs/jieba-linux-x64-musl@1.7.2 + '@node-rs/jieba-win32-arm64-msvc': registry.npmmirror.com/@node-rs/jieba-win32-arm64-msvc@1.7.2 + '@node-rs/jieba-win32-ia32-msvc': registry.npmmirror.com/@node-rs/jieba-win32-ia32-msvc@1.7.2 + '@node-rs/jieba-win32-x64-msvc': registry.npmmirror.com/@node-rs/jieba-win32-x64-msvc@1.7.2 + dev: false + registry.npmmirror.com/@nodelib/fs.scandir@2.1.5: resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==, registry: https://registry.npm.taobao.org/, tarball: https://registry.npmmirror.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz} name: '@nodelib/fs.scandir' diff --git a/projects/app/package.json b/projects/app/package.json index 1ddb71266..58c4fab8f 100644 --- a/projects/app/package.json +++ b/projects/app/package.json @@ -20,6 +20,7 @@ "@fastgpt/service": "workspace:*", "@fastgpt/web": "workspace:*", "@mozilla/readability": "^0.4.4", + "@node-rs/jieba": "^1.7.2", "@tanstack/react-query": "^4.24.10", "@types/nprogress": "^0.2.0", "axios": "^1.5.1", diff --git a/projects/app/public/locales/en/common.json b/projects/app/public/locales/en/common.json index 4c6249023..c722e81b0 100644 --- a/projects/app/public/locales/en/common.json +++ b/projects/app/public/locales/en/common.json @@ -261,6 +261,9 @@ "data": { "Edit": "Edit Data", "id": "Data ID" + }, + "test": { + "Test Result": "Results" } }, "module": { diff --git a/projects/app/public/locales/zh/common.json b/projects/app/public/locales/zh/common.json index c6d1e6cb7..53a384816 100644 --- a/projects/app/public/locales/zh/common.json +++ b/projects/app/public/locales/zh/common.json @@ -261,6 +261,9 @@ "data": { "Edit": "编辑数据", "id": "数据ID" + }, + "test": { + "Test Result": "测试结果" } }, "module": { diff --git a/projects/app/src/components/ChatBox/MessageInput.tsx b/projects/app/src/components/ChatBox/MessageInput.tsx index 7cfb68777..0ec024f6f 100644 --- a/projects/app/src/components/ChatBox/MessageInput.tsx +++ b/projects/app/src/components/ChatBox/MessageInput.tsx @@ -342,7 +342,7 @@ ${images.map((img) => JSON.stringify({ src: img.src })).join('\n')} const items = clipboardData.items; const files = Array.from(items) .map((item) => (item.kind === 'file' ? item.getAsFile() : undefined)) - .filter((item) => item) as File[]; + .filter(Boolean) as File[]; onSelectFile(files); } }} diff --git a/projects/app/src/components/ChatBox/ResponseTags.tsx b/projects/app/src/components/ChatBox/ResponseTags.tsx index 937735633..75938f12b 100644 --- a/projects/app/src/components/ChatBox/ResponseTags.tsx +++ b/projects/app/src/components/ChatBox/ResponseTags.tsx @@ -40,7 +40,7 @@ const ResponseTags = ({ responseData = [] }: { responseData?: ChatHistoryItemRes .filter((item) => item.moduleType === FlowNodeTypeEnum.chatNode) .map((item) => item.quoteList) .flat() - .filter((item) => item) as SearchDataResponseItemType[]; + .filter(Boolean) as SearchDataResponseItemType[]; const sourceList = quoteList.reduce( (acc: Record, cur) => { if (!acc[cur.sourceName]) { diff --git a/projects/app/src/global/core/api/datasetReq.d.ts b/projects/app/src/global/core/api/datasetReq.d.ts index 912f7dfdb..1872e1cc3 100644 --- a/projects/app/src/global/core/api/datasetReq.d.ts +++ b/projects/app/src/global/core/api/datasetReq.d.ts @@ -18,13 +18,6 @@ export type DatasetUpdateParams = { agentModel?: LLMModelItemType; }; -export type SearchTestProps = { - datasetId: string; - text: string; - limit?: number; - rerank?: boolean; -}; - /* ======= collections =========== */ export type GetDatasetCollectionsProps = RequestPaging & { datasetId: string; diff --git a/projects/app/src/global/core/dataset/api.d.ts b/projects/app/src/global/core/dataset/api.d.ts index a888c500a..6c25dbeb8 100644 --- a/projects/app/src/global/core/dataset/api.d.ts +++ b/projects/app/src/global/core/dataset/api.d.ts @@ -1,6 +1,9 @@ import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constant'; -import { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type'; +import { + DatasetDataIndexItemType, + SearchDataResponseItemType +} from '@fastgpt/global/core/dataset/type'; /* ================= dataset ===================== */ export type CreateDatasetParams = { @@ -34,3 +37,15 @@ export type UpdateDatasetDataProps = { dataId?: string; // pg data id })[]; }; + +/* -------------- search ---------------- */ +export type SearchTestProps = { + datasetId: string; + text: string; + limit?: number; + rerank?: boolean; +}; +export type SearchTestResponse = { + list: SearchDataResponseItemType[]; + duration: string; +}; diff --git a/projects/app/src/pages/_app.tsx b/projects/app/src/pages/_app.tsx index eb515175f..98a154e61 100644 --- a/projects/app/src/pages/_app.tsx +++ b/projects/app/src/pages/_app.tsx @@ -93,7 +93,7 @@ function App({ Component, pageProps }: AppProps) { return ( <> - {feConfigs?.systemTitle || process.env.SYSTEM_NAME || 'GPT'} + {feConfigs?.systemTitle || process.env.SYSTEM_NAME || ''} (` diff --git a/projects/app/src/pages/api/admin/initv462.ts b/projects/app/src/pages/api/admin/initv462.ts index 7720ad1e9..d734588ca 100644 --- a/projects/app/src/pages/api/admin/initv462.ts +++ b/projects/app/src/pages/api/admin/initv462.ts @@ -2,18 +2,9 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { jsonRes } from '@fastgpt/service/common/response'; import { connectToDatabase } from '@/service/mongo'; import { delay } from '@/utils/tools'; -import { PgClient } from '@fastgpt/service/common/pg'; -import { - DatasetDataIndexTypeEnum, - PgDatasetTableName -} from '@fastgpt/global/core/dataset/constant'; - import { authCert } from '@fastgpt/service/support/permission/auth/common'; import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; -import { getUserDefaultTeam } from '@fastgpt/service/support/user/team/controller'; -import { MongoDataset } from '@fastgpt/service/core/dataset/schema'; -import { defaultQAModels } from '@fastgpt/global/core/ai/model'; -import { MongoApp } from '@fastgpt/service/core/app/schema'; +import { jiebaSplit } from '@/service/core/dataset/utils'; let success = 0; /* pg 中的数据搬到 mongo dataset.datas 中,并做映射 */ @@ -22,6 +13,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) const { limit = 50 } = req.body as { limit: number }; await authCert({ req, authRoot: true }); await connectToDatabase(); + success = 0; + + console.log( + 'total', + await MongoDatasetData.countDocuments({ fullTextToken: { $exists: false } }) + ); await initFullTextToken(limit); @@ -37,4 +34,31 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) }); } } -export async function initFullTextToken(limit = 50) {} +export async function initFullTextToken(limit = 50): Promise { + try { + const dataList = await MongoDatasetData.find({ fullTextToken: { $exists: false } }, '_id q a') + .limit(limit) + .lean(); + if (dataList.length === 0) return; + + const result = await Promise.allSettled( + dataList.map((item) => { + const text = item.q + (item.a || ''); + const tokens = jiebaSplit({ text }); + + return MongoDatasetData.findByIdAndUpdate(item._id, { + $set: { + fullTextToken: tokens + } + }); + }) + ); + + success += result.filter((item) => item.status === 'fulfilled').length; + console.log(`success: ${success}`); + return initFullTextToken(limit); + } catch (error) { + await delay(1000); + return initFullTextToken(limit); + } +} diff --git a/projects/app/src/pages/api/core/app/create.ts b/projects/app/src/pages/api/core/app/create.ts index 9831070c3..e1480945a 100644 --- a/projects/app/src/pages/api/core/app/create.ts +++ b/projects/app/src/pages/api/core/app/create.ts @@ -6,6 +6,7 @@ import type { CreateAppParams } from '@fastgpt/global/core/app/api.d'; import { AppTypeEnum } from '@fastgpt/global/core/app/constants'; import { MongoApp } from '@fastgpt/service/core/app/schema'; import { authUserNotVisitor } from '@fastgpt/service/support/permission/auth/user'; +import { SimpleModeTemplate_FastGPT_Universal } from '@/global/core/app/constants'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { @@ -39,7 +40,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< teamId, tmbId, modules, - type + type, + simpleTemplateId: SimpleModeTemplate_FastGPT_Universal.id }); jsonRes(res, { diff --git a/projects/app/src/pages/api/core/app/form2Modules/fastgpt-simple.ts b/projects/app/src/pages/api/core/app/form2Modules/fastgpt-simple.ts index d1b252588..5ee44bcf2 100644 --- a/projects/app/src/pages/api/core/app/form2Modules/fastgpt-simple.ts +++ b/projects/app/src/pages/api/core/app/form2Modules/fastgpt-simple.ts @@ -41,7 +41,7 @@ function simpleChatTemplate({ { moduleId: 'userChatInput', name: '用户问题(对话入口)', - logo: '/imgs/module/userChatInput.png', + avatar: '/imgs/module/userChatInput.png', flowType: 'questionInput', position: { x: 464.32198615344566, @@ -73,7 +73,7 @@ function simpleChatTemplate({ { moduleId: 'history', name: '聊天记录', - logo: '/imgs/module/history.png', + avatar: '/imgs/module/history.png', flowType: 'historyNode', position: { x: 452.5466249541586, @@ -114,7 +114,7 @@ function simpleChatTemplate({ { moduleId: 'chatModule', name: 'AI 对话', - logo: '/imgs/module/AI.png', + avatar: '/imgs/module/AI.png', flowType: 'chatNode', showStatus: true, position: { @@ -284,7 +284,7 @@ function datasetTemplate({ { moduleId: 'userChatInput', name: '用户问题(对话入口)', - logo: '/imgs/module/userChatInput.png', + avatar: '/imgs/module/userChatInput.png', flowType: 'questionInput', position: { x: 464.32198615344566, @@ -320,7 +320,7 @@ function datasetTemplate({ { moduleId: 'history', name: '聊天记录', - logo: '/imgs/module/history.png', + avatar: '/imgs/module/history.png', flowType: 'historyNode', position: { x: 452.5466249541586, @@ -361,7 +361,7 @@ function datasetTemplate({ { moduleId: 'datasetSearch', name: '知识库搜索', - logo: '/imgs/module/db.png', + avatar: '/imgs/module/db.png', flowType: 'datasetSearchNode', showStatus: true, position: { @@ -454,7 +454,7 @@ function datasetTemplate({ { moduleId: 'chatModule', name: 'AI 对话', - logo: '/imgs/module/AI.png', + avatar: '/imgs/module/AI.png', flowType: 'chatNode', showStatus: true, position: { diff --git a/projects/app/src/pages/api/core/dataset/data/insertData.ts b/projects/app/src/pages/api/core/dataset/data/insertData.ts index 1582f1bb2..f7de1e63e 100644 --- a/projects/app/src/pages/api/core/dataset/data/insertData.ts +++ b/projects/app/src/pages/api/core/dataset/data/insertData.ts @@ -15,6 +15,7 @@ import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controll import { authTeamBalance } from '@/service/support/permission/auth/bill'; import { pushGenerateVectorBill } from '@/service/support/wallet/bill/push'; import { InsertOneDatasetDataProps } from '@/global/core/dataset/api'; +import { simpleText } from '@fastgpt/global/common/string/tools'; export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse) { try { @@ -46,8 +47,12 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex ] = await Promise.all([getCollectionWithDataset(collectionId), authTeamBalance(teamId)]); // format data - const formatQ = q.replace(/\\n/g, '\n').trim().replace(/'/g, '"'); - const formatA = a?.replace(/\\n/g, '\n').trim().replace(/'/g, '"') || ''; + const formatQ = simpleText(q); + const formatA = simpleText(a); + const formatIndexes = indexes?.map((item) => ({ + ...item, + text: simpleText(item.text) + })); // token check const token = countPromptTokens(formatQ, 'system'); @@ -72,7 +77,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex q: formatQ, a: formatA, model: vectorModelData.model, - indexes + indexes: formatIndexes }); pushGenerateVectorBill({ diff --git a/projects/app/src/pages/api/core/dataset/data/pushData.ts b/projects/app/src/pages/api/core/dataset/data/pushData.ts index 7ed069bff..3329bcd6e 100644 --- a/projects/app/src/pages/api/core/dataset/data/pushData.ts +++ b/projects/app/src/pages/api/core/dataset/data/pushData.ts @@ -13,6 +13,7 @@ import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; import { getQAModel, getVectorModel } from '@/service/core/ai/model'; import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset'; import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller'; +import { simpleText } from '@fastgpt/global/common/string/tools'; export default withNextCors(async function handler(req: NextApiRequest, res: NextApiResponse) { try { @@ -72,6 +73,21 @@ export async function pushDataToDatasetCollection({ collectionId }); + // format q and a, remove empty char + data.forEach((item) => { + item.q = simpleText(item.q); + item.a = simpleText(item.a); + + item.indexes = item.indexes + ?.map((index) => { + return { + ...index, + text: simpleText(index.text) + }; + }) + .filter(Boolean); + }); + // filter repeat or equal content const set = new Set(); const filterResult: Record = { diff --git a/projects/app/src/pages/api/core/dataset/searchTest.ts b/projects/app/src/pages/api/core/dataset/searchTest.ts index e16bd5334..aae2a02d6 100644 --- a/projects/app/src/pages/api/core/dataset/searchTest.ts +++ b/projects/app/src/pages/api/core/dataset/searchTest.ts @@ -1,9 +1,8 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { jsonRes } from '@fastgpt/service/common/response'; import { withNextCors } from '@fastgpt/service/common/middle/cors'; -import type { SearchTestProps } from '@/global/core/api/datasetReq.d'; +import type { SearchTestProps, SearchTestResponse } from '@/global/core/dataset/api.d'; import { connectToDatabase } from '@/service/mongo'; -import type { SearchDataResponseItemType } from '@fastgpt/global/core/dataset/type'; import { authDataset } from '@fastgpt/service/support/permission/auth/dataset'; import { authTeamBalance } from '@/service/support/permission/auth/bill'; import { pushGenerateVectorBill } from '@/service/support/wallet/bill/push'; @@ -22,6 +21,8 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex throw new Error('缺少参数'); } + const start = Date.now(); + // auth dataset role const { dataset, teamId, tmbId, apikey } = await authDataset({ req, @@ -61,8 +62,11 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex }); } - jsonRes(res, { - data: searchRes + jsonRes(res, { + data: { + list: searchRes, + duration: `${((Date.now() - start) / 1000).toFixed(3)}s` + } }); } catch (err) { jsonRes(res, { diff --git a/projects/app/src/pages/api/core/plugin/detail.ts b/projects/app/src/pages/api/core/plugin/detail.ts index 112907ab0..f38392f24 100644 --- a/projects/app/src/pages/api/core/plugin/detail.ts +++ b/projects/app/src/pages/api/core/plugin/detail.ts @@ -1,17 +1,16 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { jsonRes } from '@fastgpt/service/common/response'; import { connectToDatabase } from '@/service/mongo'; -import { MongoPlugin } from '@fastgpt/service/core/plugin/schema'; import { authPluginCrud } from '@fastgpt/service/support/permission/auth/plugin'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { const { id } = req.query as { id: string }; await connectToDatabase(); - await authPluginCrud({ req, authToken: true, id, per: 'r' }); + const { plugin } = await authPluginCrud({ req, authToken: true, id, per: 'r' }); jsonRes(res, { - data: await MongoPlugin.findOne({ id }) + data: plugin }); } catch (err) { jsonRes(res, { diff --git a/projects/app/src/pages/dataset/detail/components/Info.tsx b/projects/app/src/pages/dataset/detail/components/Info.tsx index 9fadd3230..d5c568d80 100644 --- a/projects/app/src/pages/dataset/detail/components/Info.tsx +++ b/projects/app/src/pages/dataset/detail/components/Info.tsx @@ -230,16 +230,13 @@ const Info = ( placeholder={'标签,使用空格分割。'} maxLength={30} onChange={(e) => { - setValue( - 'tags', - e.target.value.split(' ').filter((item) => item) - ); + setValue('tags', e.target.value.split(' ').filter(Boolean)); setRefresh(!refresh); }} /> {getValues('tags') - .filter((item) => item) + .filter(Boolean) .map((item, i) => ( {item} diff --git a/projects/app/src/pages/dataset/detail/components/Test.tsx b/projects/app/src/pages/dataset/detail/components/Test.tsx index 294fac5bc..0e076cc0d 100644 --- a/projects/app/src/pages/dataset/detail/components/Test.tsx +++ b/projects/app/src/pages/dataset/detail/components/Test.tsx @@ -16,6 +16,7 @@ import { QuestionOutlineIcon } from '@chakra-ui/icons'; import { SearchDataResponseItemType } from '@fastgpt/global/core/dataset/type'; import { useTranslation } from 'next-i18next'; import { feConfigs } from '@/web/common/system/staticData'; +import { SearchTestResponse } from '../../../../global/core/dataset/api'; const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 12); const Test = ({ datasetId }: { datasetId: string }) => { @@ -37,20 +38,21 @@ const Test = ({ datasetId }: { datasetId: string }) => { ); const { mutate, isLoading } = useRequest({ - mutationFn: () => postSearchText({ datasetId, text: inputText.trim(), rerank, limit: 20 }), - onSuccess(res: SearchDataResponseItemType[]) { - if (!res || res.length === 0) { + mutationFn: () => postSearchText({ datasetId, text: inputText.trim(), rerank, limit: 30 }), + onSuccess(res: SearchTestResponse) { + if (!res || res.list.length === 0) { return toast({ status: 'warning', title: t('dataset.test.noResult') }); } - const testItem = { + const testItem: SearchTestStoreItemType = { id: nanoid(), datasetId, text: inputText.trim(), time: new Date(), - results: res + results: res.list, + duration: res.duration }; pushDatasetTestItem(testItem); setDatasetTestItem(testItem); @@ -176,7 +178,7 @@ const Test = ({ datasetId }: { datasetId: string }) => { <> - 测试结果 + {t('core.dataset.test.Test Result')} { forceShow > + ({datasetTestItem.duration}) void; parentId?: st {getValues('tags') .split(' ') - .filter((item) => item) + .filter(Boolean) .map((item, i) => ( {item} diff --git a/projects/app/src/pages/dataset/list/index.tsx b/projects/app/src/pages/dataset/list/index.tsx index 7dddd59b1..4235d8c38 100644 --- a/projects/app/src/pages/dataset/list/index.tsx +++ b/projects/app/src/pages/dataset/list/index.tsx @@ -390,13 +390,11 @@ const Kb = () => { - {dataset.tags - .filter((item) => item) - .map((tag, i) => ( - - {tag} - - ))} + {dataset.tags.filter(Boolean).map((tag, i) => ( + + {tag} + + ))} diff --git a/projects/app/src/pages/plugin/edit/Preview.tsx b/projects/app/src/pages/plugin/edit/Preview.tsx index ecb5d2599..5af2eabe3 100644 --- a/projects/app/src/pages/plugin/edit/Preview.tsx +++ b/projects/app/src/pages/plugin/edit/Preview.tsx @@ -34,7 +34,7 @@ const PreviewPlugin = ({ item: { moduleId: 'plugin', flowType: FlowNodeTypeEnum.pluginModule, - logo: plugin.avatar, + avatar: plugin.avatar, name: plugin.name, intro: plugin.intro, ...formatPluginToPreviewModule(plugin._id, modules) diff --git a/projects/app/src/pages/plugin/edit/index.tsx b/projects/app/src/pages/plugin/edit/index.tsx index ff6b737e3..5e0acbea7 100644 --- a/projects/app/src/pages/plugin/edit/index.tsx +++ b/projects/app/src/pages/plugin/edit/index.tsx @@ -48,27 +48,32 @@ const Render = ({ pluginId }: Props) => { return copyTemplates; }, [nodes]); - const { data } = useQuery(['getOnePlugin', pluginId], () => getOnePlugin(pluginId), { - onError: (error) => { - toast({ - status: 'warning', - title: getErrText(error, t('plugin.Load Plugin Failed')) - }); - router.replace('/plugin/list'); + const { data: pluginDetail } = useQuery( + ['getOnePlugin', pluginId], + () => getOnePlugin(pluginId), + { + onError: (error) => { + toast({ + status: 'warning', + title: getErrText(error, t('plugin.Load Plugin Failed')) + }); + router.replace('/plugin/list'); + } } - }); + ); + console.log(pluginDetail); useQuery(['getPlugTemplates'], () => loadPluginTemplates()); const filterPlugins = useMemo(() => { return pluginModuleTemplates.filter((item) => item.id !== pluginId); }, [pluginId, pluginModuleTemplates]); - return data ? ( + return pluginDetail ? ( router.back()} />} + modules={pluginDetail?.modules || []} + Header={
router.back()} />} /> ) : ( diff --git a/projects/app/src/pages/plugin/list/component/EditModal.tsx b/projects/app/src/pages/plugin/list/component/EditModal.tsx index 71f0696d2..eca421d4a 100644 --- a/projects/app/src/pages/plugin/list/component/EditModal.tsx +++ b/projects/app/src/pages/plugin/list/component/EditModal.tsx @@ -24,17 +24,81 @@ import MyModal from '@/components/MyModal'; import { useTranslation } from 'next-i18next'; import { useConfirm } from '@/web/common/hooks/useConfirm'; import MyIcon from '@/components/Icon'; +import { CreateOnePluginParams } from '@fastgpt/global/core/plugin/controller'; -export type FormType = { +export type FormType = CreateOnePluginParams & { id?: string; - avatar: string; - name: string; - intro: string; }; -export const defaultForm = { +export const defaultForm: FormType = { avatar: '/icon/logo.svg', name: '', - intro: '' + intro: '', + modules: [ + { + moduleId: 'w90mfp', + name: '定义插件输入', + avatar: '/imgs/module/input.png', + flowType: 'pluginInput', + showStatus: false, + position: { + x: 616.4226348688949, + y: -165.05298493910115 + }, + inputs: [ + { + key: 'question', + valueType: 'string', + type: 'target', + label: '用户问题', + required: true, + edit: true, + connected: false + } + ], + outputs: [ + { + key: 'question', + valueType: 'string', + label: '用户问题', + type: 'source', + edit: true, + targets: [] + } + ] + }, + { + moduleId: 'tze1ju', + name: '定义插件输出', + avatar: '/imgs/module/output.png', + flowType: 'pluginOutput', + showStatus: false, + position: { + x: 1607.7142331269126, + y: -151.8669210746189 + }, + inputs: [ + { + key: 'answer', + type: 'target', + valueType: 'string', + label: '答案', + required: true, + edit: true, + connected: true + } + ], + outputs: [ + { + key: 'answer', + valueType: 'string', + label: '答案', + type: 'source', + edit: true, + targets: [] + } + ] + } + ] }; const CreateModal = ({ diff --git a/projects/app/src/service/core/app/module.ts b/projects/app/src/service/core/app/module.ts index 50f704ae2..8449d71d6 100644 --- a/projects/app/src/service/core/app/module.ts +++ b/projects/app/src/service/core/app/module.ts @@ -8,5 +8,5 @@ export const getChatModelNameListByModules = (modules: ModuleItemType[]): string const model = item.inputs.find((input) => input.key === 'model')?.value; return global.chatModels.find((item) => item.model === model)?.name || ''; }) - .filter((item) => item); + .filter(Boolean); }; diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index f04a1e8a5..353d0814e 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -8,6 +8,7 @@ import { deletePgDataById, insertData2Pg, updatePgDataById } from './pg'; import { Types } from 'mongoose'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/constant'; import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils'; +import { jiebaSplit } from '../utils'; /* insert data. * 1. create data id @@ -34,9 +35,6 @@ export async function insertData2Dataset({ return Promise.reject("teamId and tmbId can't be the same"); } - q = q.trim(); - a = a.trim(); - const id = new Types.ObjectId(); const qaStr = `${q}\n${a}`.trim(); @@ -74,6 +72,7 @@ export async function insertData2Dataset({ collectionId, q, a, + fullTextToken: jiebaSplit({ text: q + a }), indexes: indexes.map((item, i) => ({ ...item, dataId: result[i].insertId @@ -203,6 +202,7 @@ export async function updateData2Dataset({ // update mongo mongoData.q = q || mongoData.q; mongoData.a = a ?? mongoData.a; + mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a }); // @ts-ignore mongoData.indexes = indexes; await mongoData.save(); diff --git a/projects/app/src/service/core/dataset/data/pg.ts b/projects/app/src/service/core/dataset/data/pg.ts index 94c8f3563..30ef5beb7 100644 --- a/projects/app/src/service/core/dataset/data/pg.ts +++ b/projects/app/src/service/core/dataset/data/pg.ts @@ -1,5 +1,8 @@ import { PgDatasetTableName } from '@fastgpt/global/core/dataset/constant'; -import type { SearchDataResponseItemType } from '@fastgpt/global/core/dataset/type.d'; +import type { + DatasetDataWithCollectionType, + SearchDataResponseItemType +} from '@fastgpt/global/core/dataset/type.d'; import { PgClient } from '@fastgpt/service/common/pg'; import { getVectorsByText } from '@/service/core/ai/vector'; import { delay } from '@/utils/tools'; @@ -8,6 +11,7 @@ import { MongoDatasetCollection } from '@fastgpt/service/core/dataset/collection import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; import { POST } from '@fastgpt/service/common/api/plusRequest'; import { PostReRankResponse } from '@fastgpt/global/core/ai/api'; +import { jiebaSplit } from '../utils'; export async function insertData2Pg({ mongoDataId, @@ -125,39 +129,100 @@ export async function deletePgDataById( }; } -// search -export async function searchDatasetData({ - text, - model, - similarity = 0, - limit, - datasetIds = [], - rerank = false -}: { +// ------------------ search start ------------------ +type SearchProps = { text: string; model: string; similarity?: number; // min distance limit: number; datasetIds: string[]; rerank?: boolean; -}) { +}; +export async function searchDatasetData(props: SearchProps) { + const { text, similarity = 0, limit, rerank = false } = props; + + const [{ tokenLen, embeddingRecallResults }, { fullTextRecallResults }] = await Promise.all([ + embeddingRecall({ + ...props, + limit: rerank ? Math.max(50, limit * 3) : limit * 2 + }), + fullTextRecall({ + ...props, + limit: 40 + }) + ]); + + // concat recall result + let set = new Set(); + const concatRecallResults = embeddingRecallResults; + for (const item of fullTextRecallResults) { + if (!set.has(item.id)) { + concatRecallResults.push(item); + set.add(item.id); + } + } + + // remove same q and a data + set = new Set(); + const filterSameDataResults = concatRecallResults.filter((item) => { + const str = `${item.q}${item.a}`.trim(); + if (set.has(str)) return false; + set.add(str); + return true; + }); + + if (!rerank) { + return { + searchRes: filterSameDataResults.slice(0, limit), + tokenLen + }; + } + + // ReRank result + const reRankResults = await reRankSearchResult({ + query: text, + data: filterSameDataResults + }); + + // similarity filter + const filterReRankResults = reRankResults.filter((item) => item.score > similarity); + + // concat rerank and embedding data + set = new Set(filterReRankResults.map((item) => item.id)); + const concatResult = filterReRankResults.concat( + filterSameDataResults.filter((item) => { + if (set.has(item.id)) return false; + set.add(item.id); + return true; + }) + ); + + return { + searchRes: concatResult.slice(0, limit), + tokenLen + }; +} +export async function embeddingRecall({ + text, + model, + similarity = 0, + limit, + datasetIds = [], + rerank = false +}: SearchProps) { const { vectors, tokenLen } = await getVectorsByText({ model, input: [text] }); - const minLimit = global.systemEnv.pluginBaseUrl ? Math.max(50, limit * 4) : limit * 2; - const results: any = await PgClient.query( `BEGIN; SET LOCAL hnsw.ef_search = ${global.systemEnv.pgHNSWEfSearch || 100}; - select id, collection_id, data_id, (vector <#> '[${ - vectors[0] - }]') * -1 AS score from ${PgDatasetTableName} - where dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')}) AND vector <#> '[${ - vectors[0] - }]' < -${similarity} - order by score desc limit ${minLimit}; + select id, collection_id, data_id, (vector <#> '[${vectors[0]}]') * -1 AS score + from ${PgDatasetTableName} + where dataset_id IN (${datasetIds.map((id) => `'${String(id)}'`).join(',')}) + ${rerank ? '' : `AND vector <#> '[${vectors[0]}]' < -${similarity}`} + order by score desc limit ${limit}; COMMIT;` ); @@ -212,47 +277,54 @@ export async function searchDatasetData({ }) .filter((item) => item !== null) as SearchDataResponseItemType[]; - // remove same q and a data - set = new Set(); - const filterData = formatResult.filter((item) => { - const str = `${item.q}${item.a}`.trim(); - if (set.has(str)) return false; - set.add(str); - return true; - }); - - if (!rerank) { - return { - searchRes: filterData.slice(0, limit), - tokenLen - }; - } - - // ReRank result - const reRankResult = await reRankSearchResult({ - query: text, - data: filterData - }); - - // similarity filter - const filterReRankResult = reRankResult.filter((item) => item.score > similarity); - - // concat rerank and embedding data - set = new Set(filterReRankResult.map((item) => item.id)); - const concatResult = filterReRankResult.concat( - filterData.filter((item) => { - if (set.has(item.id)) return false; - set.add(item.id); - return true; - }) - ); - return { - searchRes: concatResult.slice(0, limit), + embeddingRecallResults: formatResult, tokenLen }; } +export async function fullTextRecall({ + text, + limit, + datasetIds = [], + rerank = false +}: SearchProps): Promise<{ + fullTextRecallResults: SearchDataResponseItemType[]; + tokenLen: number; +}> { + if (!rerank) { + return { + fullTextRecallResults: [], + tokenLen: 0 + }; + } + const result = (await MongoDatasetData.find( + { + datasetId: { $in: datasetIds.map((item) => item) }, + $text: { $search: jiebaSplit({ text }) } + }, + { score: { $meta: 'textScore' } } + ) + .sort({ score: { $meta: 'textScore' } }) + .limit(limit) + .populate('collectionId') + .lean()) as DatasetDataWithCollectionType[]; + + return { + fullTextRecallResults: result.map((item) => ({ + id: String(item._id), + datasetId: String(item.datasetId), + collectionId: String(item.collectionId._id), + sourceName: item.collectionId.name || '', + sourceId: item.collectionId.metadata?.fileId || item.collectionId.metadata?.rawLink, + q: item.q, + a: item.a, + indexes: item.indexes, + score: 1 + })), + tokenLen: 0 + }; +} // plus reRank search result export async function reRankSearchResult({ data, @@ -279,7 +351,7 @@ export async function reRankSearchResult({ score: item.score ?? target.score }; }) - .filter((item) => item) as SearchDataResponseItemType[]; + .filter(Boolean) as SearchDataResponseItemType[]; return mergeResult; } catch (error) { @@ -288,3 +360,4 @@ export async function reRankSearchResult({ return data; } } +// ------------------ search end ------------------ diff --git a/projects/app/src/service/core/dataset/utils.ts b/projects/app/src/service/core/dataset/utils.ts new file mode 100644 index 000000000..332cade11 --- /dev/null +++ b/projects/app/src/service/core/dataset/utils.ts @@ -0,0 +1,34 @@ +import { MongoDatasetData } from '@fastgpt/service/core/dataset/data/schema'; +import { cut, extract } from '@node-rs/jieba'; + +/** + * Same value judgment + */ +export async function hasSameValue({ + collectionId, + q, + a = '' +}: { + collectionId: string; + q: string; + a?: string; +}) { + const count = await MongoDatasetData.countDocuments({ + q, + a, + collectionId + }); + + if (count > 0) { + return Promise.reject('已经存在完全一致的数据'); + } +} + +export function jiebaSplit({ text }: { text: string }) { + const tokens = cut(text, true); + + return tokens + .map((item) => item.replace(/[^\u4e00-\u9fa5a-zA-Z0-9\s]/g, '').trim()) + .filter(Boolean) + .join(' '); +} diff --git a/projects/app/src/service/events/generateQA.ts b/projects/app/src/service/events/generateQA.ts index aedfb5821..1d9ebf078 100644 --- a/projects/app/src/service/events/generateQA.ts +++ b/projects/app/src/service/events/generateQA.ts @@ -13,8 +13,15 @@ import { getErrText } from '@fastgpt/global/common/error/utils'; import { authTeamBalance } from '../support/permission/auth/bill'; import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api.d'; -const reduceQueue = () => { +const reduceQueue = (retry = false) => { global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0; + if (global.qaQueueLen === 0 && retry) { + setTimeout(() => { + generateQA(); + }, 60000); + } + + return global.vectorQueueLen === 0; }; export async function generateQA(): Promise { @@ -32,7 +39,7 @@ export async function generateQA(): Promise { const data = await MongoDatasetTraining.findOneAndUpdate( { mode: TrainingModeEnum.qa, - lockTime: { $lte: new Date(Date.now() - 10 * 60 * 1000) } + lockTime: { $lte: new Date(Date.now() - 6 * 60 * 1000) } }, { lockTime: new Date() @@ -70,12 +77,13 @@ export async function generateQA(): Promise { } })(); - if (done) { - reduceQueue(); - global.vectorQueueLen <= 0 && console.log(`【QA】Task Done`); + if (done || !data) { + if (reduceQueue()) { + console.log(`【QA】Task Done`); + } return; } - if (error || !data) { + if (error) { reduceQueue(); return generateQA(); } @@ -171,7 +179,7 @@ export async function generateQA(): Promise { reduceQueue(); generateQA(); } catch (err: any) { - reduceQueue(); + reduceQueue(true); // log if (err?.response) { addLog.info('openai error: 生成QA错误', { diff --git a/projects/app/src/service/events/generateVector.ts b/projects/app/src/service/events/generateVector.ts index 9543157d1..9374e3a6d 100644 --- a/projects/app/src/service/events/generateVector.ts +++ b/projects/app/src/service/events/generateVector.ts @@ -7,8 +7,16 @@ import { getErrText } from '@fastgpt/global/common/error/utils'; import { authTeamBalance } from '@/service/support/permission/auth/bill'; import { pushGenerateVectorBill } from '@/service/support/wallet/bill/push'; -const reduceQueue = () => { +const reduceQueue = (retry = false) => { global.vectorQueueLen = global.vectorQueueLen > 0 ? global.vectorQueueLen - 1 : 0; + + if (global.vectorQueueLen === 0 && retry) { + setTimeout(() => { + generateVector(); + }, 60000); + } + + return global.vectorQueueLen === 0; }; /* 索引生成队列。每导入一次,就是一个单独的线程 */ @@ -57,8 +65,8 @@ export async function generateVector(): Promise { return { data, dataItem: { - q: data.q.replace(/[\x00-\x08]/g, ' '), - a: data.a?.replace(/[\x00-\x08]/g, ' ') || '', + q: data.q, + a: data.a || '', indexes: data.indexes } }; @@ -70,12 +78,13 @@ export async function generateVector(): Promise { } })(); - if (done) { - reduceQueue(); - global.vectorQueueLen <= 0 && console.log(`【index】Task done`); + if (done || !data) { + if (reduceQueue()) { + console.log(`【index】Task done`); + } return; } - if (error || !data) { + if (error) { reduceQueue(); return generateVector(); } @@ -108,8 +117,15 @@ export async function generateVector(): Promise { } // create vector and insert - try { + // invalid data + if (!data.q.trim()) { + await MongoDatasetTraining.findByIdAndDelete(data._id); + reduceQueue(); + generateVector(); + return; + } + // insert data to pg const { tokenLen } = await insertData2Dataset({ teamId: data.teamId, @@ -135,7 +151,7 @@ export async function generateVector(): Promise { reduceQueue(); generateVector(); } catch (err: any) { - reduceQueue(); + reduceQueue(true); // log if (err?.response) { addLog.info('openai error: 生成向量错误', { diff --git a/projects/app/src/utils/sse.ts b/projects/app/src/utils/sse.ts index 275fbe457..ec9309f04 100644 --- a/projects/app/src/utils/sse.ts +++ b/projects/app/src/utils/sse.ts @@ -2,7 +2,7 @@ const decoder = new TextDecoder(); export const parseStreamChunk = (value: BufferSource) => { const chunk = decoder.decode(value); - const chunkLines = chunk.split('\n\n').filter((item) => item); + const chunkLines = chunk.split('\n\n').filter(Boolean); const chunkResponse = chunkLines.map((item) => { const splitEvent = item.split('\n'); if (splitEvent.length === 2) { diff --git a/projects/app/src/web/core/dataset/api.ts b/projects/app/src/web/core/dataset/api.ts index d5a1635b8..08d47f894 100644 --- a/projects/app/src/web/core/dataset/api.ts +++ b/projects/app/src/web/core/dataset/api.ts @@ -3,12 +3,12 @@ import type { ParentTreePathItemType } from '@fastgpt/global/common/parentFolder import type { DatasetItemType } from '@fastgpt/global/core/dataset/type.d'; import type { DatasetUpdateParams, - SearchTestProps, GetDatasetCollectionsProps, GetDatasetDataListProps, CreateDatasetCollectionParams, UpdateDatasetCollectionParams } from '@/global/core/api/datasetReq.d'; +import type { SearchTestProps, SearchTestResponse } from '@/global/core/dataset/api.d'; import type { PushDatasetDataProps, UpdateDatasetDataProps, @@ -21,8 +21,6 @@ import type { SearchDataResponseItemType } from '@fastgpt/global/core/dataset/type'; import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constant'; -import { getToken } from '@/web/support/user/auth'; -import download from 'downloadjs'; import type { DatasetDataItemType } from '@fastgpt/global/core/dataset/type'; import type { DatasetCollectionsListItemType } from '@/global/core/dataset/type.d'; import { PagingData } from '@/types'; @@ -53,7 +51,7 @@ export const getCheckExportLimit = (datasetId: string) => /* =========== search test ============ */ export const postSearchText = (data: SearchTestProps) => - POST(`/core/dataset/searchTest`, data); + POST(`/core/dataset/searchTest`, data); /* ============================= collections ==================================== */ export const getDatasetCollections = (data: GetDatasetCollectionsProps) => diff --git a/projects/app/src/web/core/dataset/store/searchTest.ts b/projects/app/src/web/core/dataset/store/searchTest.ts index 2a05c9181..07297385f 100644 --- a/projects/app/src/web/core/dataset/store/searchTest.ts +++ b/projects/app/src/web/core/dataset/store/searchTest.ts @@ -8,6 +8,7 @@ export type SearchTestStoreItemType = { datasetId: string; text: string; time: Date; + duration: string; results: SearchDataResponseItemType[]; };