feat: chunk index independent config (#4271)
* sync collection
* remove lock
* feat: chunk index independent config
* feat: add max chunksize to split chunk function
* remove log
* update doc
* remove
* remove log
parent 222ff0d49a
commit e812ad6e84
@@ -11,8 +11,6 @@ weight: 853
 | --------------------- | --------------------- |
 |  |  |

 ## Create a training order

 {{< tabs tabTotal="2" >}}
@@ -289,7 +287,7 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?

 ## Collections

-### Common creation parameters
+### Common creation parameters (must read)

 **Input parameters**

@@ -300,8 +298,11 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
 | trainingType | Data processing mode. chunk: split by text length; qa: QA pair extraction | ✅ |
 | autoIndexes | Whether to automatically build indexes (commercial edition only) | |
 | imageIndex | Whether to automatically build image indexes (commercial edition only) | |
-| chunkSize | Estimated chunk size | |
-| chunkSplitter | Custom top-priority split character | |
+| chunkSettingMode | Chunking parameter mode. auto: system default parameters; custom: manually specified parameters | |
+| chunkSplitMode | Chunk split mode. size: split by length; char: split by character. Ignored when chunkSettingMode=auto. | |
+| chunkSize | Chunk size, default 1500. Ignored when chunkSettingMode=auto. | |
+| indexSize | Index size, default 512; must be smaller than the index model's maximum tokens. Ignored when chunkSettingMode=auto. | |
+| chunkSplitter | Custom top-priority split character; no further splitting happens unless the file-processing context limit is exceeded. Ignored when chunkSettingMode=auto. | |
 | qaPrompt | QA split prompt | |
 | tags | Collection tags (string array) | |
 | createTime | File creation time (Date / String) | |
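Taken together, the new fields let a caller keep the system defaults (`chunkSettingMode: "auto"`) or spell out chunking explicitly. A hedged sketch of the custom variant follows; only the field names come from the table above, the values and the surrounding request fields (datasetId, name, and so on) are illustrative:

```ts
// Hedged sketch: a "custom" chunk configuration matching the parameter table.
// Values are examples only; the rest of the create-collection body is omitted.
const chunkSetting = {
  trainingType: 'chunk', // or 'qa'
  chunkSettingMode: 'custom', // 'auto' uses system defaults and ignores the fields below
  chunkSplitMode: 'size', // 'char' splits on chunkSplitter instead of length
  chunkSize: 2000, // default 1500 when omitted
  indexSize: 512, // must stay below the embedding model's max tokens
  chunkSplitter: '', // only consulted in 'char' mode
  qaPrompt: '' // only used when trainingType is 'qa'
} as const;
```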
@@ -389,9 +390,8 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
     "name":"测试训练",

     "trainingType": "qa",
-    "chunkSize":8000,
-    "chunkSplitter":"",
-    "qaPrompt":"11",
+    "chunkSettingMode": "auto",
+    "qaPrompt":"",

     "metadata":{}
 }'
@@ -409,10 +409,6 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
 - parentId: Parent ID; defaults to the root directory if omitted
 - name: Collection name (required)
 - metadata: Metadata (not used for now)
-- trainingType: Training mode (required)
-- chunkSize: Length of each chunk (optional). chunk mode: 100~3000; qa mode: 4000 up to the model's maximum tokens (for 16k models, usually no more than 10000 is recommended)
-- chunkSplitter: Custom top-priority split character (optional)
-- qaPrompt: Custom prompt for QA splitting (optional)
 {{% /alert %}}

 {{< /markdownify >}}
@@ -462,8 +458,7 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
     "parentId": null,

     "trainingType": "chunk",
-    "chunkSize":512,
-    "chunkSplitter":"",
+    "chunkSettingMode": "auto",
     "qaPrompt":"",

     "metadata":{
@@ -483,10 +478,6 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
 - datasetId: Knowledge base ID (required)
 - parentId: Parent ID; defaults to the root directory if omitted
 - metadata.webPageSelector: Web page selector that specifies which element of the page is used as text (optional)
-- trainingType: Training mode (required)
-- chunkSize: Length of each chunk (optional). chunk mode: 100~3000; qa mode: 4000 up to the model's maximum tokens (for 16k models, usually no more than 10000 is recommended)
-- chunkSplitter: Custom top-priority split character (optional)
-- qaPrompt: Custom prompt for QA splitting (optional)
 {{% /alert %}}

 {{< /markdownify >}}
@@ -545,13 +536,7 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio

 {{% alert icon=" " context="success" %}}
 - file: File
-- data: Knowledge base information (passed in after JSON serialization)
-- datasetId: Knowledge base ID (required)
-- parentId: Parent ID; defaults to the root directory if omitted
-- trainingType: Training mode (required)
-- chunkSize: Length of each chunk (optional). chunk mode: 100~3000; qa mode: 4000 up to the model's maximum tokens (for 16k models, usually no more than 10000 is recommended)
-- chunkSplitter: Custom top-priority split character (optional)
-- qaPrompt: Custom prompt for QA splitting (optional)
+- data: Knowledge base information (passed in after JSON serialization); see "Common creation parameters" above for the field descriptions
 {{% /alert %}}

 {{< /markdownify >}}
@@ -7,12 +7,17 @@ toc: true
 weight: 799
 ---

+## Important notes
+
+- The dataset import API has changed: optional `chunkSettingMode`, `chunkSplitMode`, and `indexSize` parameters have been added. See the [dataset import API](/docs/development/openapi/dataset) documentation for details.
+
 ## 🚀 New features

-1. Dataset chunking adds preset values for custom separators and supports splitting on custom newline characters.
-2. External variables renamed to custom variables; they can be debugged during testing and are hidden directly in share links.
-3. Collection sync now also updates the title.
+1. Dataset chunking optimization: chunk size and index size can be configured independently, and oversized chunks are allowed, trading more input tokens for complete chunks.
+2. Dataset chunking adds preset values for custom separators and supports splitting on custom newline characters.
+3. External variables renamed to custom variables; they can be debugged during testing and are hidden directly in share links.
+4. Collection sync now also updates the title.

 ## ⚙️ Improvements

@@ -1,15 +1,17 @@
+import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
 import { getErrText } from '../error/utils';

 export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';

 type SplitProps = {
   text: string;
-  chunkLen: number;
+  chunkSize: number;
+  maxSize?: number;
   overlapRatio?: number;
   customReg?: string[];
 };
-export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & {
-  chunkLen?: number;
+export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
+  chunkSize?: number;
 };

 type SplitResponse = {
@@ -55,7 +57,7 @@ const strIsMdTable = (str: string) => {
   return true;
 };
 const markdownTableSplit = (props: SplitProps): SplitResponse => {
-  let { text = '', chunkLen } = props;
+  let { text = '', chunkSize } = props;
   const splitText2Lines = text.split('\n');
   const header = splitText2Lines[0];
   const headerSize = header.split('|').length - 2;
@@ -71,7 +73,7 @@ ${mdSplitString}
 `;

   for (let i = 2; i < splitText2Lines.length; i++) {
-    if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) {
+    if (chunk.length + splitText2Lines[i].length > chunkSize * 1.2) {
       chunks.push(chunk);
       chunk = `${header}
 ${mdSplitString}
@@ -98,11 +100,17 @@ ${mdSplitString}
   5. 标点分割:重叠
 */
 const commonSplit = (props: SplitProps): SplitResponse => {
-  let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props;
+  let {
+    text = '',
+    chunkSize,
+    maxSize = defaultMaxChunkSize,
+    overlapRatio = 0.15,
+    customReg = []
+  } = props;

   const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
   const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
-  const overlapLen = Math.round(chunkLen * overlapRatio);
+  const overlapLen = Math.round(chunkSize * overlapRatio);

   // replace code block all \n to codeBlockMarker
   text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
@@ -118,24 +126,24 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   const stepReges: { reg: RegExp | string; maxLen: number }[] = [
     ...customReg.map((text) => ({
       reg: text.replaceAll('\\n', '\n'),
-      maxLen: chunkLen * 1.4
+      maxLen: chunkSize
     })),
-    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 },
-    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 },
-    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 },
-    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
-    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 },
+    { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
+    { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },

-    { reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
-    { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
-    { reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 },
-    { reg: /([\n])/g, maxLen: chunkLen * 1.2 },
+    { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
+    { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
+    { reg: /(\n{2,})/g, maxLen: chunkSize },
+    { reg: /([\n])/g, maxLen: chunkSize },
     // ------ There's no overlap on the top
-    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
-    { reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 },
-    { reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 },
-    { reg: /([;]|;\s)/g, maxLen: chunkLen * 1.6 },
-    { reg: /([,]|,\s)/g, maxLen: chunkLen * 2 }
+    { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkSize },
+    { reg: /([!]|!\s)/g, maxLen: chunkSize },
+    { reg: /([?]|\?\s)/g, maxLen: chunkSize },
+    { reg: /([;]|;\s)/g, maxLen: chunkSize },
+    { reg: /([,]|,\s)/g, maxLen: chunkSize }
   ];

   const customRegLen = customReg.length;
@@ -203,7 +211,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
   /* Gets the overlap at the end of a text as the beginning of the next block */
   const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
     const forbidOverlap = checkForbidOverlap(step);
-    const maxOverlapLen = chunkLen * 0.4;
+    const maxOverlapLen = chunkSize * 0.4;

     // step >= stepReges.length: Do not overlap incomplete sentences
     if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
@@ -246,13 +254,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {

     // oversize
     if (step >= stepReges.length) {
-      if (text.length < chunkLen * 3) {
+      if (text.length < chunkSize * 3) {
         return [text];
       }
-      // use slice-chunkLen to split text
+      // use slice-chunkSize to split text
       const chunks: string[] = [];
-      for (let i = 0; i < text.length; i += chunkLen - overlapLen) {
-        chunks.push(text.slice(i, i + chunkLen));
+      for (let i = 0; i < text.length; i += chunkSize - overlapLen) {
+        chunks.push(text.slice(i, i + chunkSize));
       }
       return chunks;
     }
@@ -260,8 +268,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
     // split text by special char
     const splitTexts = getSplitTexts({ text, step });

-    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
-    const minChunkLen = chunkLen * 0.7;
+    const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
+    const minChunkLen = chunkSize * 0.7;

     const chunks: string[] = [];
     for (let i = 0; i < splitTexts.length; i++) {
@@ -297,7 +305,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
         continue;
       }

-      // newText is too large(now, The lastText must be smaller than chunkLen)
+      // newText is too large(now, The lastText must be smaller than chunkSize)
       if (newTextLen > maxLen) {
         // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
         if (lastTextLen > minChunkLen) {
@@ -352,7 +360,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {

     /* If the last chunk is independent, it needs to be push chunks. */
     if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
-      if (lastText.length < chunkLen * 0.4) {
+      if (lastText.length < chunkSize * 0.4) {
         chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
       } else {
         chunks.push(lastText);
@@ -386,9 +394,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {

 /**
  * text split into chunks
- * chunkLen - one chunk len. max: 3500
+ * chunkSize - one chunk len. max: 3500
  * overlapLen - The size of the before and after Text
- * chunkLen > overlapLen
+ * chunkSize > overlapLen
  * markdown
  */
 export const splitText2Chunks = (props: SplitProps): SplitResponse => {
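The splitter diff above renames `chunkLen` to `chunkSize` and introduces an optional `maxSize` ceiling (defaulting to `defaultMaxChunkSize`) for oversized chunks. A minimal call sketch, assuming the module path and sample text:

```ts
// Hedged sketch: the import path and the sample markdown are assumptions,
// the option names follow the updated SplitProps type shown above.
import { splitText2Chunks } from './textSplitter';

const { chunks } = splitText2Chunks({
  text: '# Title\n\nFirst paragraph...\n\nSecond paragraph...',
  chunkSize: 1500, // was `chunkLen` before this commit
  maxSize: 8000, // cap for oversized chunks; defaults to defaultMaxChunkSize
  overlapRatio: 0.15,
  customReg: []
});
```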
packages/global/core/dataset/api.d.ts (vendored, 13 changed lines)

@@ -1,5 +1,10 @@
 import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
-import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants';
+import {
+  DatasetCollectionTypeEnum,
+  DatasetCollectionDataProcessModeEnum,
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum
+} from './constants';
 import type { LLMModelItemType } from '../ai/model.d';
 import { ParentIdType } from 'common/parentFolder/type';

@@ -33,7 +38,13 @@ export type DatasetCollectionChunkMetadataType = {
   trainingType?: DatasetCollectionDataProcessModeEnum;
   imageIndex?: boolean;
   autoIndexes?: boolean;
+
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+
   chunkSize?: number;
+  indexSize?: number;
+
   chunkSplitter?: string;
   qaPrompt?: string;
   metadata?: Record<string, any>;
@@ -129,6 +129,16 @@ export const DatasetCollectionDataProcessModeMap = {
   }
 };

+export enum ChunkSettingModeEnum {
+  auto = 'auto',
+  custom = 'custom'
+}
+
+export enum DataChunkSplitModeEnum {
+  size = 'size',
+  char = 'char'
+}
+
 /* ------------ data -------------- */

 /* ------------ training -------------- */
packages/global/core/dataset/controller.d.ts (vendored, 1 changed line)

@@ -13,6 +13,7 @@ export type CreateDatasetDataProps = {

 export type UpdateDatasetDataProps = {
   dataId: string;
   q?: string;
   a?: string;
   indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {
@@ -15,6 +15,8 @@ export type PushDataToTrainingQueueProps = {
   vectorModel: string;
   vlmModel?: string;

+  indexSize?: number;
+
   billId?: string;
   session?: ClientSession;
 };
packages/global/core/dataset/training/utils.ts (new file, 136 lines)

@@ -0,0 +1,136 @@
import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d';
import {
  ChunkSettingModeEnum,
  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum
} from '../constants';

export const minChunkSize = 64; // min index and chunk size

// Chunk size
export const chunkAutoChunkSize = 1500;
export const getMaxChunkSize = (model: LLMModelItemType) => {
  return Math.max(model.maxContext - model.maxResponse, 2000);
};

// QA
export const defaultMaxChunkSize = 8000;
export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => {
  if (!model) return defaultMaxChunkSize;
  return Math.max(Math.min(model.maxContext - model.maxResponse, defaultMaxChunkSize), 2000);
};

export const getLLMMaxChunkSize = (model?: LLMModelItemType) => {
  if (!model) return 8000;
  return Math.max(model.maxContext - model.maxResponse, 2000);
};

// Index size
export const getMaxIndexSize = (model?: EmbeddingModelItemType) => {
  return model?.maxToken || 512;
};
export const getAutoIndexSize = (model?: EmbeddingModelItemType) => {
  return model?.defaultToken || 512;
};

const indexSizeSelectList = [
  { label: '64', value: 64 },
  { label: '128', value: 128 },
  { label: '256', value: 256 },
  { label: '512', value: 512 },
  { label: '768', value: 768 },
  { label: '1024', value: 1024 },
  { label: '1536', value: 1536 },
  { label: '2048', value: 2048 },
  { label: '3072', value: 3072 },
  { label: '4096', value: 4096 },
  { label: '5120', value: 5120 },
  { label: '6144', value: 6144 },
  { label: '7168', value: 7168 },
  { label: '8192', value: 8192 }
];
export const getIndexSizeSelectList = (max = 512) => {
  return indexSizeSelectList.filter((item) => item.value <= max);
};

// Compute
export const computeChunkSize = (params: {
  trainingType: DatasetCollectionDataProcessModeEnum;
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  llmModel?: LLMModelItemType;
  chunkSize?: number;
}) => {
  if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
    if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
      return getLLMDefaultChunkSize(params.llmModel);
    }
  } else {
    // chunk
    if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
      return chunkAutoChunkSize;
    }
  }

  if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
    return getLLMMaxChunkSize(params.llmModel);
  }

  return Math.min(params.chunkSize || chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
};

export const computeChunkSplitter = (params: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  chunkSplitter?: string;
}) => {
  if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
    return undefined;
  }
  if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
    return undefined;
  }
  return params.chunkSplitter;
};
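These helpers centralize how the effective chunk size and splitter are resolved for a collection. A usage sketch with an assumed 16k-context model entry (the model values and the `as any` cast are illustrative):

```ts
// Hedged usage sketch based on the new training/utils.ts above.
import {
  computeChunkSize,
  computeChunkSplitter
} from '@fastgpt/global/core/dataset/training/utils';
import {
  ChunkSettingModeEnum,
  DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum
} from '@fastgpt/global/core/dataset/constants';

const llmModel = { maxContext: 16000, maxResponse: 4000 } as any; // hypothetical model entry

const chunkSize = computeChunkSize({
  trainingType: DatasetCollectionDataProcessModeEnum.qa,
  chunkSettingMode: ChunkSettingModeEnum.custom,
  chunkSplitMode: DataChunkSplitModeEnum.size,
  chunkSize: 20000, // clamped to getLLMMaxChunkSize(llmModel), i.e. 12000 here
  llmModel
});

const chunkSplitter = computeChunkSplitter({
  chunkSettingMode: ChunkSettingModeEnum.custom,
  chunkSplitMode: DataChunkSplitModeEnum.char,
  chunkSplitter: '\\n\\n' // kept only in char mode with custom settings
});
```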
packages/global/core/dataset/type.d.ts (vendored, 9 changed lines)

@@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
 import { PermissionTypeEnum } from '../../support/permission/constant';
 import { PushDatasetDataChunkProps } from './api';
 import {
+  DataChunkSplitModeEnum,
   DatasetCollectionDataProcessModeEnum,
   DatasetCollectionTypeEnum,
   DatasetStatusEnum,
@@ -14,6 +15,7 @@ import { Permission } from '../../support/permission/controller';
 import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
 import { SourceMemberType } from 'support/user/type';
 import { DatasetDataIndexTypeEnum } from './data/constants';
+import { ChunkSettingModeEnum } from './constants';

 export type DatasetSchemaType = {
   _id: string;
@@ -88,7 +90,12 @@ export type DatasetCollectionSchemaType = {
   autoIndexes?: boolean;
   imageIndex?: boolean;
   trainingType: DatasetCollectionDataProcessModeEnum;
-  chunkSize: number;
+
+  chunkSettingMode?: ChunkSettingModeEnum;
+  chunkSplitMode?: DataChunkSplitModeEnum;
+
+  chunkSize?: number;
+  indexSize?: number;
   chunkSplitter?: string;
   qaPrompt?: string;
 };
@@ -1,7 +1,6 @@
 import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
 import { getFileIcon } from '../../common/file/icon';
 import { strIsLink } from '../../common/string/tools';
-import { DatasetDataIndexTypeEnum } from './data/constants';

 export function getCollectionIcon(
   type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@@ -38,26 +37,6 @@ export function getSourceNameIcon({
   return 'file/fill/file';
 }

-/* get dataset data default index */
-export function getDefaultIndex(props?: { q?: string; a?: string }) {
-  const { q = '', a } = props || {};
-
-  return [
-    {
-      text: q,
-      type: DatasetDataIndexTypeEnum.default
-    },
-    ...(a
-      ? [
-          {
-            text: a,
-            type: DatasetDataIndexTypeEnum.default
-          }
-        ]
-      : [])
-  ];
-}
-
 export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
   if (mode === TrainingModeEnum.qa) return data.length * 20;
   if (mode === TrainingModeEnum.auto) return data.length * 5;
@@ -27,6 +27,11 @@ import { addDays } from 'date-fns';
 import { MongoDatasetDataText } from '../data/dataTextSchema';
 import { retryFn } from '@fastgpt/global/common/system/utils';
 import { getTrainingModeByCollection } from './utils';
+import {
+  computeChunkSize,
+  computeChunkSplitter,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';

 export const createCollectionAndInsertData = async ({
   dataset,
@@ -54,18 +59,22 @@ export const createCollectionAndInsertData = async ({

   const teamId = createCollectionParams.teamId;
   const tmbId = createCollectionParams.tmbId;
-  // Chunk split params
+
+  // Set default params
   const trainingType =
     createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
-  const chunkSize = createCollectionParams.chunkSize || 512;
-  const chunkSplitter = createCollectionParams.chunkSplitter;
-  const qaPrompt = createCollectionParams.qaPrompt;
-  const usageName = createCollectionParams.name;
+  const chunkSize = computeChunkSize({
+    ...createCollectionParams,
+    trainingType,
+    llmModel: getLLMModel(dataset.agentModel)
+  });
+  const chunkSplitter = computeChunkSplitter(createCollectionParams);

   // 1. split chunks
   const chunks = rawText2Chunks({
     rawText,
-    chunkLen: chunkSize,
+    chunkSize,
+    maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
     overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
     customReg: chunkSplitter ? [chunkSplitter] : [],
     isQAImport
@@ -76,7 +85,7 @@ export const createCollectionAndInsertData = async ({
     teamId,
     insertLen: predictDataLimitLength(
       getTrainingModeByCollection({
-        trainingType,
+        trainingType: trainingType,
         autoIndexes: createCollectionParams.autoIndexes,
         imageIndex: createCollectionParams.imageIndex
       }),
@@ -88,6 +97,9 @@ export const createCollectionAndInsertData = async ({
   // 3. create collection
   const { _id: collectionId } = await createOneCollection({
     ...createCollectionParams,
+    trainingType,
+    chunkSize,
+    chunkSplitter,

     hashRawText: hashStr(rawText),
     rawTextLength: rawText.length,
@@ -111,7 +123,7 @@ export const createCollectionAndInsertData = async ({
     const { billId: newBillId } = await createTrainingUsage({
       teamId,
       tmbId,
-      appName: usageName,
+      appName: createCollectionParams.name,
       billSource: UsageSourceEnum.training,
       vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
       agentModel: getLLMModel(dataset.agentModel)?.name,
@@ -130,12 +142,13 @@ export const createCollectionAndInsertData = async ({
       agentModel: dataset.agentModel,
       vectorModel: dataset.vectorModel,
       vlmModel: dataset.vlmModel,
+      indexSize: createCollectionParams.indexSize,
       mode: getTrainingModeByCollection({
-        trainingType,
+        trainingType: trainingType,
        autoIndexes: createCollectionParams.autoIndexes,
        imageIndex: createCollectionParams.imageIndex
       }),
-      prompt: qaPrompt,
+      prompt: createCollectionParams.qaPrompt,
       billId: traingBillId,
       data: chunks.map((item, index) => ({
         ...item,
@@ -207,11 +220,14 @@ export async function createOneCollection({
   // Parse settings
   customPdfParse,
   imageIndex,
+  autoIndexes,

   // Chunk settings
-  trainingType = DatasetCollectionDataProcessModeEnum.chunk,
-  autoIndexes,
-  chunkSize = 512,
+  trainingType,
+  chunkSettingMode,
+  chunkSplitMode,
+  chunkSize,
+  indexSize,
   chunkSplitter,
   qaPrompt,

@@ -249,11 +265,14 @@ export async function createOneCollection({
     // Parse settings
     customPdfParse,
     imageIndex,
+    autoIndexes,

     // Chunk settings
     trainingType,
-    autoIndexes,
+    chunkSettingMode,
+    chunkSplitMode,
     chunkSize,
+    indexSize,
     chunkSplitter,
     qaPrompt
   }
@@ -3,7 +3,9 @@ const { Schema, model, models } = connectionMongo;
 import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
 import {
   DatasetCollectionTypeMap,
-  DatasetCollectionDataProcessModeEnum
+  DatasetCollectionDataProcessModeEnum,
+  ChunkSettingModeEnum,
+  DataChunkSplitModeEnum
 } from '@fastgpt/global/core/dataset/constants';
 import { DatasetCollectionName } from '../schema';
 import {
@@ -94,11 +96,18 @@ const DatasetCollectionSchema = new Schema({
     type: String,
     enum: Object.values(DatasetCollectionDataProcessModeEnum)
   },
-  chunkSize: {
-    type: Number,
-    required: true
+  chunkSettingMode: {
+    type: String,
+    enum: Object.values(ChunkSettingModeEnum)
   },
+  chunkSplitMode: {
+    type: String,
+    enum: Object.values(DataChunkSplitModeEnum)
+  },
+  chunkSize: Number,
   chunkSplitter: String,
+  indexSize: Number,
   qaPrompt: String
 });

@@ -185,7 +185,7 @@ export const readApiServerFileContent = async ({
 export const rawText2Chunks = ({
   rawText,
   isQAImport,
-  chunkLen = 512,
+  chunkSize = 512,
   ...splitProps
 }: {
   rawText: string;
@@ -198,7 +198,7 @@ export const rawText2Chunks = ({

   const { chunks } = splitText2Chunks({
     text: rawText,
-    chunkLen,
+    chunkSize,
     ...splitProps
   });

@@ -12,6 +12,10 @@ import { getCollectionWithDataset } from '../controller';
 import { mongoSessionRun } from '../../../common/mongo/sessionRun';
 import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
 import { i18nT } from '../../../../web/i18n/utils';
+import {
+  getLLMDefaultChunkSize,
+  getLLMMaxChunkSize
+} from '../../../../global/core/dataset/training/utils';

 export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
   try {
@@ -55,6 +59,7 @@ export async function pushDataListToTrainingQueue({
   prompt,
   billId,
   mode = TrainingModeEnum.chunk,
+  indexSize,
   session
 }: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
   const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
@@ -68,38 +73,41 @@ export async function pushDataListToTrainingQueue({
     }
     return mode;
   };

+  const vectorModelData = getEmbeddingModel(vectorModel);
+  if (!vectorModelData) {
+    return Promise.reject(i18nT('common:error_embedding_not_config'));
+  }
+  const agentModelData = getLLMModel(agentModel);
+  if (!agentModelData) {
+    return Promise.reject(i18nT('common:error_llm_not_config'));
+  }
+  if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
+    prompt = undefined;
+  }
+
   const { model, maxToken, weight } = await (async () => {
     if (mode === TrainingModeEnum.chunk) {
-      const vectorModelData = getEmbeddingModel(vectorModel);
-      if (!vectorModelData) {
-        return Promise.reject(i18nT('common:error_embedding_not_config'));
-      }
       return {
-        maxToken: vectorModelData.maxToken * 1.5,
+        maxToken: getLLMMaxChunkSize(agentModelData),
         model: vectorModelData.model,
         weight: vectorModelData.weight
       };
     }

     if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
-      const agentModelData = getLLMModel(agentModel);
-      if (!agentModelData) {
-        return Promise.reject(i18nT('common:error_llm_not_config'));
-      }
       return {
-        maxToken: agentModelData.maxContext * 0.8,
+        maxToken: getLLMMaxChunkSize(agentModelData),
         model: agentModelData.model,
         weight: 0
       };
     }

     if (mode === TrainingModeEnum.image) {
       const vllmModelData = getVlmModel(vlmModel);
       if (!vllmModelData) {
         return Promise.reject(i18nT('common:error_vlm_not_config'));
       }
       return {
-        maxToken: vllmModelData.maxContext * 0.8,
+        maxToken: getLLMMaxChunkSize(vllmModelData),
         model: vllmModelData.model,
         weight: 0
       };
@@ -107,10 +115,6 @@ export async function pushDataListToTrainingQueue({

     return Promise.reject(`Training mode "${mode}" is inValid`);
   })();
-  // Filter redundant params
-  if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
-    prompt = undefined;
-  }

   // filter repeat or equal content
   const set = new Set();
@@ -143,13 +147,13 @@ export async function pushDataListToTrainingQueue({

     const text = item.q + item.a;

+    // Oversize llm tokens
     if (text.length > maxToken) {
       filterResult.overToken.push(item);
       return;
     }

     if (set.has(text)) {
-      console.log('repeat', item);
       filterResult.repeat.push(item);
     } else {
       filterResult.success.push(item);
@@ -182,6 +186,7 @@ export async function pushDataListToTrainingQueue({
       q: item.q,
       a: item.a,
       chunkIndex: item.chunkIndex ?? 0,
+      indexSize,
       weight: weight ?? 0,
       indexes: item.indexes,
       retryCount: 5
@@ -76,6 +76,7 @@ const TrainingDataSchema = new Schema({
     type: Number,
     default: 0
   },
+  indexSize: Number,
   weight: {
     type: Number,
     default: 0
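With `indexSize` added to both the queue props and `TrainingDataSchema`, each queued item now carries its target index size alongside the chunk itself. A hedged sketch of the per-item payload; the field list is taken from the insert shown above, the exact types are assumptions:

```ts
// Illustrative shape only; types are assumed, not declared by this commit.
type TrainingQueueItem = {
  q: string;
  a?: string;
  chunkIndex: number;
  indexSize?: number; // new: forwarded from the collection's index size
  weight: number;
  indexes?: { text: string }[];
  retryCount: number;
};
```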
@@ -72,7 +72,7 @@ const EditFolderModal = ({
           {...register('name', { required: true })}
           bg={'myGray.50'}
           autoFocus
-          maxLength={20}
+          maxLength={100}
         />
       </Box>
       <Box mt={4}>
packages/web/components/common/Radio/RadioGroup.tsx (new file, 67 lines)

@@ -0,0 +1,67 @@
import React from 'react';
import { Box, Flex, Grid, type GridProps, HStack } from '@chakra-ui/react';
import { useTranslation } from 'next-i18next';
import QuestionTip from '../MyTooltip/QuestionTip';

type Props<T> = Omit<GridProps, 'onChange'> & {
  list: {
    title: string;
    value: T;
    tooltip?: string;
  }[];
  value: T;
  defaultBg?: string;
  activeBg?: string;
  onChange: (e: T) => void;
};

const RadioGroup = <T = any,>({ list, value, onChange, ...props }: Props<T>) => {
  const { t } = useTranslation();

  return (
    <Flex gap={[3, 5]} fontSize={['sm', 'md']} alignItems={'center'} {...props}>
      {list.map((item) => (
        <Flex
          alignItems={'center'}
          key={item.value as any}
          cursor={'pointer'}
          userSelect={'none'}
          gap={1}
          onClick={() => onChange(item.value)}
        >
          <Box
            w={'18px'}
            h={'18px'}
            borderWidth={'2.4px'}
            borderColor={value === item.value ? 'primary.015' : 'transparent'}
            borderRadius={'50%'}
          >
            <Flex
              w={'100%'}
              h={'100%'}
              borderWidth={'1px'}
              borderColor={value === item.value ? 'primary.600' : 'borderColor.high'}
              bg={value === item.value ? 'primary.1' : 'transparent'}
              borderRadius={'50%'}
              alignItems={'center'}
              justifyContent={'center'}
            >
              <Box
                w={'5px'}
                h={'5px'}
                borderRadius={'50%'}
                bg={value === item.value ? 'primary.600' : 'transparent'}
              />
            </Flex>
          </Box>
          <HStack spacing={1} color={'myGray.900'} whiteSpace={'nowrap'} fontSize={'sm'}>
            <Box>{typeof item.title === 'string' ? t(item.title as any) : item.title}</Box>
            {!!item.tooltip && <QuestionTip label={item.tooltip} color={'myGray.600'} />}
          </HStack>
        </Flex>
      ))}
    </Flex>
  );
};

export default RadioGroup;
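A hedged usage sketch for the new RadioGroup component, for example to pick the chunk split mode. The `dataset:` namespace prefix and the wrapper component are assumptions; the option keys themselves do exist in the locale files added by this commit:

```tsx
// Illustrative only: import alias follows the package's '@fastgpt/web/...' convention.
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';

const ChunkSplitModePicker = ({
  value,
  onChange
}: {
  value: 'size' | 'char';
  onChange: (v: 'size' | 'char') => void;
}) => (
  <RadioGroup<'size' | 'char'>
    list={[
      { title: 'dataset:split_chunk_size', value: 'size' },
      { title: 'dataset:split_chunk_char', value: 'char', tooltip: 'dataset:custom_split_sign_tip' }
    ]}
    value={value}
    onChange={onChange}
  />
);
```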
@@ -569,7 +569,6 @@
 "core.dataset.import.Custom process": "Custom Rules",
 "core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
 "core.dataset.import.Custom prompt": "Custom Prompt",
-"core.dataset.import.Custom split char": "Custom Separator",
 "core.dataset.import.Custom text": "Custom Text",
 "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
 "core.dataset.import.Data process params": "Data Processing Parameters",
@@ -27,7 +27,6 @@
 "custom_data_process_params": "Custom",
 "custom_data_process_params_desc": "Customize data processing rules",
 "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
-"data.ideal_chunk_length": "ideal block length",
 "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
 "data_index_num": "Index {{index}}",
 "data_process_params": "Params",
@@ -53,8 +52,6 @@
 "file_model_function_tip": "Enhances indexing and QA generation",
 "filename": "Filename",
 "folder_dataset": "Folder",
-"ideal_chunk_length": "ideal block length",
-"ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
 "image_auto_parse": "Automatic image indexing",
 "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
 "image_training_queue": "Queue of image processing",
@@ -68,6 +65,8 @@
 "import_param_setting": "Parameter settings",
 "import_select_file": "Select a file",
 "import_select_link": "Enter link",
+"index_size": "Index size",
+"index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
 "is_open_schedule": "Enable scheduled synchronization",
 "keep_image": "Keep the picture",
 "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
@@ -89,6 +88,8 @@
 "retain_collection": "Adjust Training Parameters",
 "retrain_task_submitted": "The retraining task has been submitted",
 "same_api_collection": "The same API set exists",
+"split_chunk_char": "Block by specified splitter",
+"split_chunk_size": "Block by length",
 "split_sign_break": "1 newline character",
 "split_sign_break2": "2 newline characters",
 "split_sign_custom": "Customize",
@@ -573,7 +573,6 @@
 "core.dataset.import.Custom process": "自定义规则",
 "core.dataset.import.Custom process desc": "自定义设置数据处理规则",
 "core.dataset.import.Custom prompt": "自定义提示词",
-"core.dataset.import.Custom split char": "自定义分隔符",
 "core.dataset.import.Custom text": "自定义文本",
 "core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
 "core.dataset.import.Data process params": "数据处理参数",
@@ -27,7 +27,6 @@
 "custom_data_process_params": "自定义",
 "custom_data_process_params_desc": "自定义设置数据处理规则",
 "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号,例如: * () [] {} 等。",
-"data.ideal_chunk_length": "理想分块长度",
 "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
 "data_index_num": "索引 {{index}}",
 "data_process_params": "处理参数",
@@ -53,8 +52,6 @@
 "file_model_function_tip": "用于增强索引和 QA 生成",
 "filename": "文件名",
 "folder_dataset": "文件夹",
-"ideal_chunk_length": "理想分块长度",
-"ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。",
 "image_auto_parse": "图片自动索引",
 "image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引",
 "image_training_queue": "图片处理排队",
@@ -68,6 +65,8 @@
 "import_param_setting": "参数设置",
 "import_select_file": "选择文件",
 "import_select_link": "输入链接",
+"index_size": "索引大小",
+"index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
 "is_open_schedule": "启用定时同步",
 "keep_image": "保留图片",
 "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
@@ -89,6 +88,8 @@
 "retain_collection": "调整训练参数",
 "retrain_task_submitted": "重新训练任务已提交",
 "same_api_collection": "存在相同的 API 集合",
+"split_chunk_char": "按指定分割符分块",
+"split_chunk_size": "按长度分块",
 "split_sign_break": "1 个换行符",
 "split_sign_break2": "2 个换行符",
 "split_sign_custom": "自定义",
@@ -568,7 +568,6 @@
 "core.dataset.import.Custom process": "自訂規則",
 "core.dataset.import.Custom process desc": "自訂設定資料處理規則",
 "core.dataset.import.Custom prompt": "自訂提示詞",
-"core.dataset.import.Custom split char": "自訂分隔符",
 "core.dataset.import.Custom text": "自訂文字",
 "core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
 "core.dataset.import.Data process params": "資料處理參數",
@@ -27,7 +27,6 @@
 "custom_data_process_params": "自訂",
 "custom_data_process_params_desc": "自訂資料處理規則",
 "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據,使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號,例如: * () [] {} 等。",
-"data.ideal_chunk_length": "理想分塊長度",
 "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
 "data_index_num": "索引 {{index}}",
 "data_process_params": "處理參數",
@@ -53,8 +52,6 @@
 "file_model_function_tip": "用於增強索引和問答生成",
 "filename": "檔案名稱",
 "folder_dataset": "資料夾",
-"ideal_chunk_length": "理想分塊長度",
-"ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。",
 "image_auto_parse": "圖片自動索引",
 "image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引",
 "image_training_queue": "圖片處理排隊",
@@ -68,6 +65,8 @@
 "import_param_setting": "參數設置",
 "import_select_file": "選擇文件",
 "import_select_link": "輸入鏈接",
+"index_size": "索引大小",
+"index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
 "is_open_schedule": "啟用定時同步",
 "keep_image": "保留圖片",
 "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
@@ -89,6 +88,8 @@
 "retain_collection": "調整訓練參數",
 "retrain_task_submitted": "重新訓練任務已提交",
 "same_api_collection": "存在相同的 API 集合",
+"split_chunk_char": "按指定分割符分塊",
+"split_chunk_size": "按長度分塊",
 "split_sign_break": "1 個換行符",
 "split_sign_break2": "2 個換行符",
 "split_sign_custom": "自定義",
@ -71,7 +71,7 @@ const EditResourceModal = ({
|
|||||||
{...register('name', { required: true })}
|
{...register('name', { required: true })}
|
||||||
bg={'myGray.50'}
|
bg={'myGray.50'}
|
||||||
autoFocus
|
autoFocus
|
||||||
maxLength={20}
|
maxLength={100}
|
||||||
/>
|
/>
|
||||||
</HStack>
|
</HStack>
|
||||||
</Box>
|
</Box>
|
||||||
|
|||||||
@ -338,7 +338,7 @@ function EditKeyModal({
  <FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel>
  <Input
  placeholder={t('publish:key_alias') || 'key_alias'}
- maxLength={20}
+ maxLength={100}
  {...register('name', {
  required: t('common:common.name_is_empty') || 'name_is_empty'
  })}
@ -117,7 +117,7 @@ function EditModal({
  ml={4}
  autoFocus
  bg={'myWhite.600'}
- maxLength={20}
+ maxLength={100}
  placeholder={t('user:team.Team Name')}
  {...register('name', {
  required: t('common:common.Please Input Name')
@ -326,7 +326,7 @@ function EditLinkModal({
  <FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel>
  <Input
  placeholder={t('publish:link_name')}
- maxLength={20}
+ maxLength={100}
  {...register('name', {
  required: t('common:common.name_is_empty')
  })}
@ -26,7 +26,7 @@ function BasicInfo({
  </FormLabel>
  <Input
  placeholder={t('publish:publish_name')}
- maxLength={20}
+ maxLength={100}
  {...register('name', {
  required: t('common:common.name_is_empty')
  })}
@ -96,7 +96,7 @@ const ExtractFieldModal = ({
  <Input
  bg={'myGray.50'}
  placeholder="name/age/sql"
- maxLength={20}
+ maxLength={100}
  {...register('key', { required: true })}
  />
  </Flex>
@ -418,7 +418,7 @@ const NodeCard = (props: Props) => {
  {RenderToolHandle}

  <ConfirmSyncModal />
- <EditTitleModal maxLength={50} />
+ <EditTitleModal maxLength={100} />
  </Flex>
  );
  };
@ -319,7 +319,7 @@ const TemplateMarketModal = ({
  onChange={(e) => setCurrentSearch(e.target.value)}
  h={8}
  bg={'myGray.50'}
- maxLength={20}
+ maxLength={100}
  borderRadius={'sm'}
  />
  </Box>
@ -49,7 +49,7 @@ const EditFolderModal = ({
  defaultValue={name}
  placeholder={t('common:dataset.Folder Name') || ''}
  autoFocus
- maxLength={20}
+ maxLength={100}
  />
  </ModalBody>
  <ModalFooter>
@ -10,11 +10,21 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
  import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
  import MyIcon from '@fastgpt/web/components/common/Icon';
  import { TabEnum } from '../NavBar';
- import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+ import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
  import { UseFormReturn, useForm } from 'react-hook-form';
  import { ImportSourceItemType } from '@/web/core/dataset/type';
  import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
  import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
+ import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
+ import {
+   getMaxChunkSize,
+   getLLMDefaultChunkSize,
+   getLLMMaxChunkSize,
+   chunkAutoChunkSize,
+   minChunkSize,
+   getAutoIndexSize,
+   getMaxIndexSize
+ } from '@fastgpt/global/core/dataset/training/utils';

  type TrainingFiledType = {
  chunkOverlapRatio: number;
@ -22,6 +32,9 @@ type TrainingFiledType = {
  minChunkSize: number;
  autoChunkSize: number;
  chunkSize: number;
+ maxIndexSize?: number;
+ indexSize?: number;
+ autoIndexSize?: number;
  charsPointsPrice: number;
  priceTip: string;
  uploadRate: number;
@ -47,9 +60,13 @@ export type ImportFormType = {
  autoIndexes: boolean;

  chunkSettingMode: ChunkSettingModeEnum;
+ chunkSplitMode: DataChunkSplitModeEnum;
  embeddingChunkSize: number;
  qaChunkSize: number;
- customSplitChar: string;
+ chunkSplitter: string;
+ indexSize: number;

  qaPrompt: string;
  webSelector: string;
  };
@ -199,9 +216,12 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,

  chunkSettingMode: ChunkSettingModeEnum.auto,
- embeddingChunkSize: vectorModel?.defaultToken || 512,
- qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
- customSplitChar: '',
+ chunkSplitMode: DataChunkSplitModeEnum.size,
+ embeddingChunkSize: 2000,
+ indexSize: vectorModel?.defaultToken || 512,
+ qaChunkSize: getLLMDefaultChunkSize(agentModel),
+ chunkSplitter: '',
  qaPrompt: Prompt_AgentQA.description,
  webSelector: '',
  customPdfParse: false
@ -215,17 +235,18 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
  const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
  const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
  const qaChunkSize = processParamsForm.watch('qaChunkSize');
- const customSplitChar = processParamsForm.watch('customSplitChar');
+ const chunkSplitter = processParamsForm.watch('chunkSplitter');
  const autoIndexes = processParamsForm.watch('autoIndexes');
+ const indexSize = processParamsForm.watch('indexSize');

  const TrainingModeMap = useMemo<TrainingFiledType>(() => {
  if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
  return {
  chunkSizeField: 'qaChunkSize',
  chunkOverlapRatio: 0,
- maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7),
- minChunkSize: 4000,
- autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7),
+ maxChunkSize: getLLMMaxChunkSize(agentModel),
+ minChunkSize: 1000,
+ autoChunkSize: getLLMDefaultChunkSize(agentModel),
  chunkSize: qaChunkSize,
  charsPointsPrice: agentModel.charsPointsPrice || 0,
  priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
@ -237,10 +258,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
  return {
  chunkSizeField: 'embeddingChunkSize',
  chunkOverlapRatio: 0.2,
- maxChunkSize: 2048,
- minChunkSize: 100,
- autoChunkSize: vectorModel?.defaultToken ? vectorModel.defaultToken * 2 : 1024,
+ maxChunkSize: getMaxChunkSize(agentModel),
+ minChunkSize: minChunkSize,
+ autoChunkSize: chunkAutoChunkSize,
  chunkSize: embeddingChunkSize,
+ maxIndexSize: getMaxIndexSize(vectorModel),
+ autoIndexSize: getAutoIndexSize(vectorModel),
+ indexSize,
  charsPointsPrice: agentModel.charsPointsPrice || 0,
  priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
  price: agentModel.charsPointsPrice
@ -251,10 +275,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
  return {
  chunkSizeField: 'embeddingChunkSize',
  chunkOverlapRatio: 0.2,
- maxChunkSize: vectorModel?.maxToken || 512,
- minChunkSize: 100,
- autoChunkSize: vectorModel?.defaultToken || 512,
+ maxChunkSize: getMaxChunkSize(agentModel),
+ minChunkSize: minChunkSize,
+ autoChunkSize: chunkAutoChunkSize,
  chunkSize: embeddingChunkSize,
+ maxIndexSize: getMaxIndexSize(vectorModel),
+ autoIndexSize: getAutoIndexSize(vectorModel),
+ indexSize,
  charsPointsPrice: vectorModel.charsPointsPrice || 0,
  priceTip: t('dataset:import.Embedding Estimated Price Tips', {
  price: vectorModel.charsPointsPrice
@ -265,30 +292,36 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
  }, [
  trainingType,
  autoIndexes,
- agentModel.maxResponse,
- agentModel.maxContext,
- agentModel.charsPointsPrice,
+ agentModel,
  qaChunkSize,
  t,
- vectorModel.defaultToken,
- vectorModel?.maxToken,
- vectorModel.charsPointsPrice,
- embeddingChunkSize
+ embeddingChunkSize,
+ vectorModel,
+ indexSize
  ]);

  const chunkSettingModeMap = useMemo(() => {
  if (chunkSettingMode === ChunkSettingModeEnum.auto) {
  return {
  chunkSize: TrainingModeMap.autoChunkSize,
- customSplitChar: ''
+ indexSize: TrainingModeMap.autoIndexSize,
+ chunkSplitter: ''
  };
  } else {
  return {
  chunkSize: TrainingModeMap.chunkSize,
- customSplitChar
+ indexSize: TrainingModeMap.indexSize,
+ chunkSplitter
  };
  }
- }, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]);
+ }, [
+ chunkSettingMode,
+ TrainingModeMap.autoChunkSize,
+ TrainingModeMap.autoIndexSize,
+ TrainingModeMap.chunkSize,
+ TrainingModeMap.indexSize,
+ chunkSplitter
+ ]);

  const contextValue = {
  ...TrainingModeMap,
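Note: as a reading aid only, a minimal TypeScript sketch of how the two chunk-setting modes above resolve the effective parameters. The enum and field names mirror the diff; the `resolveChunkSettings` helper and the simplified `TrainingModeValues` type are illustrative assumptions, not code from this commit.

```ts
import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';

// Simplified stand-in for the values exposed by TrainingModeMap (assumption of this sketch).
type TrainingModeValues = {
  autoChunkSize: number;
  autoIndexSize?: number;
  chunkSize: number;
  indexSize?: number;
};

// auto: system defaults and no custom splitter; custom: whatever the import form holds.
const resolveChunkSettings = (
  mode: ChunkSettingModeEnum,
  map: TrainingModeValues,
  chunkSplitter: string
) =>
  mode === ChunkSettingModeEnum.auto
    ? { chunkSize: map.autoChunkSize, indexSize: map.autoIndexSize, chunkSplitter: '' }
    : { chunkSize: map.chunkSize, indexSize: map.indexSize, chunkSplitter };
```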
@ -20,10 +20,11 @@ import MyIcon from '@fastgpt/web/components/common/Icon';
  import { useTranslation } from 'next-i18next';
  import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
  import {
+ DataChunkSplitModeEnum,
  DatasetCollectionDataProcessModeEnum,
  DatasetCollectionDataProcessModeMap
  } from '@fastgpt/global/core/dataset/constants';
- import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+ import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
  import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
  import { useSystemStore } from '@/web/common/system/useSystemStore';
  import MyModal from '@fastgpt/web/components/common/MyModal';
@ -37,25 +38,39 @@ import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
  import { shadowLight } from '@fastgpt/web/styles/theme';
  import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
  import MySelect from '@fastgpt/web/components/common/MySelect';
+ import { getIndexSizeSelectList } from '@fastgpt/global/core/dataset/training/utils';
+ import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';

  function DataProcess() {
  const { t } = useTranslation();
  const { feConfigs } = useSystemStore();

- const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } =
-   useContextSelector(DatasetImportContext, (v) => v);
+ const {
+   goToNext,
+   processParamsForm,
+   chunkSizeField,
+   minChunkSize,
+   maxChunkSize,
+   maxIndexSize,
+   indexSize
+ } = useContextSelector(DatasetImportContext, (v) => v);
  const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
  const { setValue, register, watch, getValues } = processParamsForm;

  const trainingType = watch('trainingType');
- const chunkSettingMode = watch('chunkSettingMode');
- const qaPrompt = watch('qaPrompt');
- const {
-   isOpen: isOpenCustomPrompt,
-   onOpen: onOpenCustomPrompt,
-   onClose: onCloseCustomPrompt
- } = useDisclosure();
+ const trainingModeList = useMemo(() => {
+   const list = Object.entries(DatasetCollectionDataProcessModeMap);
+   return list
+     .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
+     .map(([key, value]) => ({
+       title: t(value.label as any),
+       value: key as DatasetCollectionDataProcessModeEnum,
+       tooltip: t(value.tooltip as any)
+     }));
+ }, [t]);
+
+ const chunkSettingMode = watch('chunkSettingMode');
+ const chunkSplitMode = watch('chunkSplitMode');

  const customSplitList = [
  { label: t('dataset:split_sign_null'), value: '' },
@ -69,25 +84,25 @@ function DataProcess() {
  { label: t('dataset:split_sign_custom'), value: 'Other' }
  ];

- const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar'));
+ const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
  useEffect(() => {
  if (customListSelectValue === 'Other') {
- setValue('customSplitChar', '');
+ setValue('chunkSplitter', '');
  } else {
- setValue('customSplitChar', customListSelectValue);
+ setValue('chunkSplitter', customListSelectValue);
  }
  }, [customListSelectValue, setValue]);

- const trainingModeList = useMemo(() => {
-   const list = Object.entries(DatasetCollectionDataProcessModeMap);
-   return list
-     .filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
-     .map(([key, value]) => ({
-       title: t(value.label as any),
-       value: key as DatasetCollectionDataProcessModeEnum,
-       tooltip: t(value.tooltip as any)
-     }));
- }, [t]);
+ // Index size
+ const indexSizeSeletorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
+
+ // QA
+ const qaPrompt = watch('qaPrompt');
+ const {
+   isOpen: isOpenCustomPrompt,
+   onOpen: onOpenCustomPrompt,
+   onClose: onCloseCustomPrompt
+ } = useDisclosure();

  const Title = useCallback(({ title }: { title: string }) => {
  return (
@ -237,67 +252,97 @@ function DataProcess() {
  children: chunkSettingMode === ChunkSettingModeEnum.custom && (
  <Box mt={5}>
  <Box>
-   <Flex alignItems={'center'}>
-     <Box>{t('dataset:ideal_chunk_length')}</Box>
-     <QuestionTip label={t('dataset:ideal_chunk_length_tips')} />
-   </Flex>
-   <Box
-     mt={1}
-     css={{
-       '& > span': {
-         display: 'block'
-       }
-     }}
-   >
-     <MyTooltip
-       label={t('common:core.dataset.import.Chunk Range', {
-         min: minChunkSize,
-         max: maxChunkSize
-       })}
-     >
-       <MyNumberInput
-         register={register}
-         name={chunkSizeField}
-         min={minChunkSize}
-         max={maxChunkSize}
-         size={'sm'}
-         step={100}
-       />
-     </MyTooltip>
-   </Box>
+   <RadioGroup<DataChunkSplitModeEnum>
+     list={[
+       {
+         title: t('dataset:split_chunk_size'),
+         value: DataChunkSplitModeEnum.size
+       },
+       {
+         title: t('dataset:split_chunk_char'),
+         value: DataChunkSplitModeEnum.char,
+         tooltip: t('dataset:custom_split_sign_tip')
+       }
+     ]}
+     value={chunkSplitMode}
+     onChange={(e) => {
+       setValue('chunkSplitMode', e);
+     }}
+   />
+
+   {chunkSplitMode === DataChunkSplitModeEnum.size && (
+     <Box
+       mt={1.5}
+       css={{
+         '& > span': {
+           display: 'block'
+         }
+       }}
+     >
+       <MyTooltip
+         label={t('common:core.dataset.import.Chunk Range', {
+           min: minChunkSize,
+           max: maxChunkSize
+         })}
+       >
+         <MyNumberInput
+           register={register}
+           name={chunkSizeField}
+           min={minChunkSize}
+           max={maxChunkSize}
+           size={'sm'}
+           step={100}
+         />
+       </MyTooltip>
+     </Box>
+   )}
+
+   {chunkSplitMode === DataChunkSplitModeEnum.char && (
+     <HStack mt={1.5}>
+       <Box flex={'1 0 0'}>
+         <MySelect<string>
+           list={customSplitList}
+           size={'sm'}
+           bg={'myGray.50'}
+           value={customListSelectValue}
+           h={'32px'}
+           onChange={(val) => {
+             setCustomListSelectValue(val);
+           }}
+         />
+       </Box>
+       {customListSelectValue === 'Other' && (
+         <Input
+           flex={'1 0 0'}
+           h={'32px'}
+           size={'sm'}
+           bg={'myGray.50'}
+           placeholder="\n;======;==SPLIT=="
+           {...register('chunkSplitter')}
+         />
+       )}
+     </HStack>
+   )}
  </Box>

- <Box mt={3}>
-   <Box>
-     {t('common:core.dataset.import.Custom split char')}
-     <QuestionTip label={t('dataset:custom_split_sign_tip')} />
-   </Box>
-   <HStack mt={1}>
-     <Box flex={'1 0 0'}>
-       <MySelect<string>
-         list={customSplitList}
-         size={'sm'}
-         bg={'myGray.50'}
-         value={customListSelectValue}
-         h={'32px'}
-         onChange={(val) => {
-           setCustomListSelectValue(val);
-         }}
-       />
-     </Box>
-     {customListSelectValue === 'Other' && (
-       <Input
-         flex={'1 0 0'}
-         h={'32px'}
-         size={'sm'}
-         bg={'myGray.50'}
-         placeholder="\n;======;==SPLIT=="
-         {...register('customSplitChar')}
-       />
-     )}
-   </HStack>
- </Box>
+ {trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
+   <Box>
+     <Flex alignItems={'center'} mt={3}>
+       <Box>{t('dataset:index_size')}</Box>
+       <QuestionTip label={t('dataset:index_size_tips')} />
+     </Flex>
+     <Box mt={1}>
+       <MySelect<number>
+         bg={'myGray.50'}
+         list={indexSizeSeletorList}
+         value={indexSize}
+         onChange={(val) => {
+           setValue('indexSize', val);
+         }}
+       />
+     </Box>
+   </Box>
+ )}

  {showQAPromptInput && (
  <Box mt={3}>
@ -16,6 +16,7 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContex
  import MyBox from '@fastgpt/web/components/common/MyBox';
  import Markdown from '@/components/Markdown';
  import { useToast } from '@fastgpt/web/hooks/useToast';
+ import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

  const PreviewData = () => {
  const { t } = useTranslation();
@ -23,6 +24,7 @@ const PreviewData = () => {
  const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext);

  const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
+ const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);

  const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
  const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
@ -36,12 +38,13 @@ const PreviewData = () => {
  async () => {
  if (!previewFile) return;
  if (importSource === ImportDataSourceEnum.fileCustom) {
- const customSplitChar = processParamsForm.getValues('customSplitChar');
+ const chunkSplitter = processParamsForm.getValues('chunkSplitter');
  const { chunks } = splitText2Chunks({
  text: previewFile.rawText || '',
- chunkLen: chunkSize,
+ chunkSize,
+ maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
  overlapRatio: chunkOverlapRatio,
- customReg: customSplitChar ? [customSplitChar] : []
+ customReg: chunkSplitter ? [chunkSplitter] : []
  });
  return chunks.map((chunk) => ({
  q: chunk,
@ -61,9 +64,12 @@ const PreviewData = () => {

  customPdfParse: processParamsForm.getValues('customPdfParse'),

+ trainingType: processParamsForm.getValues('trainingType'),
+ chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
+ chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
  chunkSize,
+ chunkSplitter: processParamsForm.getValues('chunkSplitter'),
  overlapRatio: chunkOverlapRatio,
- customSplitChar: processParamsForm.getValues('customSplitChar'),

  selector: processParamsForm.getValues('webSelector'),
  isQAImport: importSource === ImportDataSourceEnum.csvTable,
@ -49,7 +49,7 @@ const Upload = () => {
  const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
  const retrainNewCollectionId = useRef('');

- const { importSource, parentId, sources, setSources, processParamsForm, chunkSize } =
+ const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
  useContextSelector(DatasetImportContext, (v) => v);

  const { handleSubmit } = processParamsForm;
@ -81,7 +81,7 @@ const Upload = () => {
  }, [waitingFilesCount, totalFilesCount, allFinished, t]);

  const { runAsync: startUpload, loading: isLoading } = useRequest2(
- async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => {
+ async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
  if (sources.length === 0) return;
  const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');

@ -111,10 +111,16 @@ const Upload = () => {
  trainingType,
  imageIndex: processParamsForm.getValues('imageIndex'),
  autoIndexes: processParamsForm.getValues('autoIndexes'),

+ chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
+ chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
+
  chunkSize,
- chunkSplitter: customSplitChar,
+ indexSize,
+ chunkSplitter,
  qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
  };

  if (importSource === ImportDataSourceEnum.reTraining) {
  const res = await postReTrainingDatasetFileCollection({
  ...commonParams,
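Note: for orientation, a hedged sketch of the collection-creation payload that Upload now assembles with the chunk settings above. The field names come from `commonParams` in the diff; the literal values and the string forms of the enums are examples only, not taken from the commit.

```ts
// Example values only; field names mirror commonParams above.
const commonParams = {
  trainingType: 'chunk',
  imageIndex: false,
  autoIndexes: false,
  chunkSettingMode: 'custom',
  chunkSplitMode: 'size',
  chunkSize: 1500,
  indexSize: 512,
  chunkSplitter: '',
  qaPrompt: undefined
};
```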
@ -1,102 +0,0 @@
- import React from 'react';
- import { Box } from '@chakra-ui/react';
- import { ImportSourceItemType } from '@/web/core/dataset/type';
- import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer';
- import { getPreviewChunks } from '@/web/core/dataset/api';
- import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
- import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
- import { useContextSelector } from 'use-context-selector';
- import { DatasetImportContext } from '../Context';
- import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
- import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
- import { getPreviewSourceReadType } from '../utils';
-
- const PreviewChunks = ({
-   previewSource,
-   onClose
- }: {
-   previewSource: ImportSourceItemType;
-   onClose: () => void;
- }) => {
-   const { importSource, chunkSize, chunkOverlapRatio, processParamsForm } = useContextSelector(
-     DatasetImportContext,
-     (v) => v
-   );
-   const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
-
-   const { data = [], loading: isLoading } = useRequest2(
-     async () => {
-       if (importSource === ImportDataSourceEnum.fileCustom) {
-         const customSplitChar = processParamsForm.getValues('customSplitChar');
-         const { chunks } = splitText2Chunks({
-           text: previewSource.rawText || '',
-           chunkLen: chunkSize,
-           overlapRatio: chunkOverlapRatio,
-           customReg: customSplitChar ? [customSplitChar] : []
-         });
-         return chunks.map((chunk) => ({
-           q: chunk,
-           a: ''
-         }));
-       }
-
-       return getPreviewChunks({
-         datasetId,
-         type: getPreviewSourceReadType(previewSource),
-         sourceId:
-           previewSource.dbFileId ||
-           previewSource.link ||
-           previewSource.externalFileUrl ||
-           previewSource.apiFileId ||
-           '',
-
-         chunkSize,
-         overlapRatio: chunkOverlapRatio,
-         customSplitChar: processParamsForm.getValues('customSplitChar'),
-
-         selector: processParamsForm.getValues('webSelector'),
-         isQAImport: importSource === ImportDataSourceEnum.csvTable,
-         externalFileId: previewSource.externalFileId
-       });
-     },
-     {
-       manual: false
-     }
-   );
-
-   return (
-     <MyRightDrawer
-       onClose={onClose}
-       iconSrc={previewSource.icon}
-       title={previewSource.sourceName}
-       isLoading={isLoading}
-       maxW={['90vw', '40vw']}
-       px={0}
-     >
-       <Box overflowY={'auto'} px={5} fontSize={'sm'}>
-         {data.map((item, index) => (
-           <Box
-             key={index}
-             whiteSpace={'pre-wrap'}
-             fontSize={'sm'}
-             p={4}
-             bg={index % 2 === 0 ? 'white' : 'myWhite.600'}
-             mb={3}
-             borderRadius={'md'}
-             borderWidth={'1px'}
-             borderColor={'borderColor.low'}
-             boxShadow={'2'}
-             _notLast={{
-               mb: 2
-             }}
-           >
-             <Box color={'myGray.900'}>{item.q}</Box>
-             <Box color={'myGray.500'}>{item.a}</Box>
-           </Box>
-         ))}
-       </Box>
-     </MyRightDrawer>
-   );
- };
-
- export default React.memo(PreviewChunks);
@ -8,10 +8,11 @@ import { useRouter } from 'next/router';
  import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
  import { getDatasetCollectionById } from '@/web/core/dataset/api';
  import MyBox from '@fastgpt/web/components/common/MyBox';
- import { ChunkSettingModeEnum } from '@/web/core/dataset/constants';
+ import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
  import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
- import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
  import { Box } from '@chakra-ui/react';
+ import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
+ import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';

  const Upload = dynamic(() => import('../commonProgress/Upload'));
  const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
@ -23,7 +24,6 @@ const ReTraining = () => {
  collectionId: string;
  };

- const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
  const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
  const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
  const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@ -46,18 +46,21 @@ const ReTraining = () => {
  uploadedFileRate: 100
  }
  ]);

  processParamsForm.reset({
  customPdfParse: collection.customPdfParse,
  trainingType: collection.trainingType,
  imageIndex: collection.imageIndex,
  autoIndexes: collection.autoIndexes,

- chunkSettingMode: ChunkSettingModeEnum.auto,
+ chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
+ chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
  embeddingChunkSize: collection.chunkSize,
  qaChunkSize: collection.chunkSize,
- customSplitChar: collection.chunkSplitter,
- qaPrompt: collection.qaPrompt,
- webSelector: collection.metadata?.webPageSelector
+ indexSize: collection.indexSize || 512,
+ chunkSplitter: collection.chunkSplitter,
+ webSelector: collection.metadata?.webPageSelector,
+ qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
  });
  }
  });
@ -294,7 +294,7 @@ const MyInfo = ({ onOpenContact }: { onOpenContact: () => void }) => {
  title={t('account_info:click_modify_nickname')}
  borderColor={'transparent'}
  transform={'translateX(-11px)'}
- maxLength={20}
+ maxLength={100}
  onBlur={async (e) => {
  const val = e.target.value;
  if (val === userInfo?.team?.memberName) return;
@ -2,8 +2,7 @@ import { reTrainingDatasetFileCollectionParams } from '@fastgpt/global/core/data
  import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
  import {
  DatasetCollectionTypeEnum,
- DatasetSourceReadTypeEnum,
- TrainingModeEnum
+ DatasetSourceReadTypeEnum
  } from '@fastgpt/global/core/dataset/constants';
  import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
  import { hashStr } from '@fastgpt/global/common/string/tools';
@ -4,7 +4,7 @@
  */
  import type { NextApiRequest } from 'next';
  import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index';
- import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
+ import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
  import { hasSameValue } from '@/service/core/dataset/data/utils';
  import { insertData2Dataset } from '@/service/core/dataset/data/controller';
  import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
@ -16,6 +16,7 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit
  import { NextAPI } from '@/service/middleware/entry';
  import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
  import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+ import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

  async function handler(req: NextApiRequest) {
  const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps;
@ -45,7 +46,7 @@ async function handler(req: NextApiRequest) {
  // auth collection and get dataset
  const [
  {
- dataset: { _id: datasetId, vectorModel }
+ dataset: { _id: datasetId, vectorModel, agentModel }
  }
  ] = await Promise.all([getCollectionWithDataset(collectionId)]);

@ -60,9 +61,11 @@ async function handler(req: NextApiRequest) {
  // token check
  const token = await countPromptTokens(formatQ + formatA, '');
  const vectorModelData = getEmbeddingModel(vectorModel);
+ const llmModelData = getLLMModel(agentModel);
+ const maxChunkSize = getLLMMaxChunkSize(llmModelData);

- if (token > vectorModelData.maxToken) {
- return Promise.reject('Q Over Tokens');
+ if (token > maxChunkSize) {
+ return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
  }

  // Duplicate data check
@ -82,7 +85,7 @@ async function handler(req: NextApiRequest) {
  q: formatQ,
  a: formatA,
  chunkIndex: 0,
- model: vectorModelData.model,
+ embeddingModel: vectorModelData.model,
  indexes: formatIndexes
  });

@ -1,4 +1,9 @@
- import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
+ import {
+   ChunkSettingModeEnum,
+   DataChunkSplitModeEnum,
+   DatasetCollectionDataProcessModeEnum,
+   DatasetSourceReadTypeEnum
+ } from '@fastgpt/global/core/dataset/constants';
  import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
  import { NextAPI } from '@/service/middleware/entry';
  import { ApiRequestProps } from '@fastgpt/service/type/next';
@ -8,17 +13,30 @@ import {
  } from '@fastgpt/global/support/permission/constant';
  import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
  import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
+ import {
+   computeChunkSize,
+   computeChunkSplitter,
+   getLLMMaxChunkSize
+ } from '@fastgpt/global/core/dataset/training/utils';
+ import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
+ import { getLLMModel } from '@fastgpt/service/core/ai/model';

  export type PostPreviewFilesChunksProps = {
  datasetId: string;
  type: DatasetSourceReadTypeEnum;
  sourceId: string;

- chunkSize: number;
- overlapRatio: number;
- customSplitChar?: string;
  customPdfParse?: boolean;

+ trainingType: DatasetCollectionDataProcessModeEnum;
+
+ // Chunk settings
+ chunkSettingMode: ChunkSettingModeEnum;
+ chunkSplitMode: DataChunkSplitModeEnum;
+ chunkSize: number;
+ chunkSplitter?: string;
+ overlapRatio: number;

  // Read params
  selector?: string;
  isQAImport?: boolean;
@ -32,55 +50,64 @@ export type PreviewChunksResponse = {
  async function handler(
  req: ApiRequestProps<PostPreviewFilesChunksProps>
  ): Promise<PreviewChunksResponse> {
- const {
+ let {
  type,
  sourceId,
+ customPdfParse = false,
+
+ trainingType,
+ chunkSettingMode,
+ chunkSplitMode,
  chunkSize,
- customSplitChar,
+ chunkSplitter,

  overlapRatio,
  selector,
  isQAImport,
  datasetId,
- externalFileId,
- customPdfParse = false
+ externalFileId
  } = req.body;

  if (!sourceId) {
  throw new Error('sourceId is empty');
  }
- if (chunkSize > 30000) {
-   throw new Error('chunkSize is too large, should be less than 30000');
- }
-
- const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => {
-   if (type === DatasetSourceReadTypeEnum.fileLocal) {
-     const res = await authCollectionFile({
-       req,
-       authToken: true,
-       authApiKey: true,
-       fileId: sourceId,
-       per: OwnerPermissionVal
-     });
-     return {
-       teamId: res.teamId,
-       tmbId: res.tmbId
-     };
-   }
-   const { dataset, teamId, tmbId } = await authDataset({
-     req,
-     authApiKey: true,
-     authToken: true,
-     datasetId,
-     per: WritePermissionVal
-   });
-   return {
-     teamId,
-     tmbId,
-     apiServer: dataset.apiServer,
-     feishuServer: dataset.feishuServer,
-     yuqueServer: dataset.yuqueServer
-   };
- })();
+ const fileAuthRes =
+   type === DatasetSourceReadTypeEnum.fileLocal
+     ? await authCollectionFile({
+         req,
+         authToken: true,
+         authApiKey: true,
+         fileId: sourceId,
+         per: OwnerPermissionVal
+       })
+     : undefined;
+
+ const { dataset, teamId, tmbId } = await authDataset({
+   req,
+   authApiKey: true,
+   authToken: true,
+   datasetId,
+   per: WritePermissionVal
+ });
+
+ if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
+   return Promise.reject(CommonErrEnum.unAuthFile);
+ }
+
+ chunkSize = computeChunkSize({
+   trainingType,
+   chunkSettingMode,
+   chunkSplitMode,
+   chunkSize,
+   llmModel: getLLMModel(dataset.agentModel)
+ });
+
+ chunkSplitter = computeChunkSplitter({
+   chunkSettingMode,
+   chunkSplitMode,
+   chunkSplitter
+ });

  const { rawText } = await readDatasetSourceRawText({
  teamId,
@ -89,18 +116,19 @@ async function handler(
  sourceId,
  selector,
  isQAImport,
- apiServer,
- feishuServer,
- yuqueServer,
+ apiServer: dataset.apiServer,
+ feishuServer: dataset.feishuServer,
+ yuqueServer: dataset.yuqueServer,
  externalFileId,
  customPdfParse
  });

  return rawText2Chunks({
  rawText,
- chunkLen: chunkSize,
+ chunkSize,
+ maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
  overlapRatio,
- customReg: customSplitChar ? [customSplitChar] : [],
+ customReg: chunkSplitter ? [chunkSplitter] : [],
  isQAImport: isQAImport
  }).slice(0, 10);
  }
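Note: a minimal sketch of a request body for the preview handler above, built from the new `PostPreviewFilesChunksProps` fields. The literal values are placeholders, and the string forms of the enums ('fileLocal', 'chunk', 'custom', 'size') are assumptions about how they serialize, not taken from the commit.

```ts
// Field names mirror PostPreviewFilesChunksProps; values are illustrative only.
const previewChunksBody = {
  datasetId: 'datasetId',
  type: 'fileLocal',
  sourceId: 'fileId',
  customPdfParse: false,
  trainingType: 'chunk',
  chunkSettingMode: 'custom',
  chunkSplitMode: 'size',
  chunkSize: 1500,
  chunkSplitter: '',
  overlapRatio: 0.2
};
```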
@ -5,25 +5,63 @@ import {
|
|||||||
UpdateDatasetDataProps
|
UpdateDatasetDataProps
|
||||||
} from '@fastgpt/global/core/dataset/controller';
|
} from '@fastgpt/global/core/dataset/controller';
|
||||||
import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
|
import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
|
||||||
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
|
|
||||||
import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
|
import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
|
||||||
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
|
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
|
||||||
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
|
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
|
||||||
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model';
|
import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
|
||||||
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
|
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
|
||||||
import { ClientSession } from '@fastgpt/service/common/mongo';
|
import { ClientSession } from '@fastgpt/service/common/mongo';
|
||||||
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
|
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
|
||||||
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
|
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
|
||||||
|
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
|
||||||
|
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
|
||||||
|
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
|
||||||
|
|
||||||
const formatIndexes = ({
|
const formatIndexes = async ({
|
||||||
indexes,
|
indexes,
|
||||||
q,
|
q,
|
||||||
a = ''
|
a = '',
|
||||||
|
indexSize
|
||||||
}: {
|
}: {
|
||||||
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
|
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
|
||||||
q: string;
|
q: string;
|
||||||
a?: string;
|
a?: string;
|
||||||
}) => {
|
indexSize: number;
|
||||||
|
}): Promise<
|
||||||
|
{
|
||||||
|
type: `${DatasetDataIndexTypeEnum}`;
|
||||||
|
text: string;
|
||||||
|
dataId?: string;
|
||||||
|
}[]
|
||||||
|
> => {
|
||||||
|
/* get dataset data default index */
|
||||||
|
const getDefaultIndex = ({
|
||||||
|
q = '',
|
||||||
|
a,
|
||||||
|
indexSize
|
||||||
|
}: {
|
||||||
|
q?: string;
|
||||||
|
a?: string;
|
||||||
|
indexSize: number;
|
||||||
|
}) => {
|
||||||
|
const qChunks = splitText2Chunks({
|
||||||
|
text: q,
|
||||||
|
chunkSize: indexSize
|
||||||
|
}).chunks;
|
||||||
|
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
|
||||||
|
|
||||||
|
return [
|
||||||
|
...qChunks.map((text) => ({
|
||||||
|
text,
|
||||||
|
type: DatasetDataIndexTypeEnum.default
|
||||||
|
})),
|
||||||
|
...aChunks.map((text) => ({
|
||||||
|
text,
|
||||||
|
type: DatasetDataIndexTypeEnum.default
|
||||||
|
}))
|
||||||
|
];
|
||||||
|
};
|
||||||
|
|
||||||
indexes = indexes || [];
|
indexes = indexes || [];
|
||||||
// If index not type, set it to custom
|
// If index not type, set it to custom
|
||||||
indexes = indexes
|
indexes = indexes
|
||||||
@ -35,7 +73,7 @@ const formatIndexes = ({
|
|||||||
.filter((item) => !!item.text.trim());
|
.filter((item) => !!item.text.trim());
|
||||||
|
|
||||||
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
|
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
|
||||||
const defaultIndexes = getDefaultIndex({ q, a });
|
const defaultIndexes = getDefaultIndex({ q, a, indexSize });
|
||||||
const concatDefaultIndexes = defaultIndexes.map((item) => {
|
const concatDefaultIndexes = defaultIndexes.map((item) => {
|
||||||
const oldIndex = indexes!.find((index) => index.text === item.text);
|
const oldIndex = indexes!.find((index) => index.text === item.text);
|
||||||
if (oldIndex) {
|
if (oldIndex) {
|
||||||
@ -56,11 +94,24 @@ const formatIndexes = ({
|
|||||||
(item, index, self) => index === self.findIndex((t) => t.text === item.text)
|
(item, index, self) => index === self.findIndex((t) => t.text === item.text)
|
||||||
);
|
);
|
||||||
|
|
||||||
return indexes.map((index) => ({
|
const chekcIndexes = (
|
||||||
type: index.type,
|
await Promise.all(
|
||||||
text: index.text,
|
indexes.map(async (item) => {
|
||||||
dataId: index.dataId
|
// If oversize tokens, split it
|
||||||
}));
|
const tokens = await countPromptTokens(item.text);
|
||||||
|
if (tokens > indexSize) {
|
||||||
|
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
|
||||||
|
return splitText.map((text) => ({
|
||||||
|
text,
|
||||||
|
type: item.type
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
return item;
|
||||||
|
})
|
||||||
|
)
|
||||||
|
).flat();
|
||||||
|
|
||||||
|
return chekcIndexes;
|
||||||
};
|
};
|
||||||
/* insert data.
|
/* insert data.
|
||||||
* 1. create data id
|
* 1. create data id
|
||||||
@ -75,30 +126,40 @@ export async function insertData2Dataset({
|
|||||||
q,
|
q,
|
||||||
a = '',
|
a = '',
|
||||||
chunkIndex = 0,
|
chunkIndex = 0,
|
||||||
|
indexSize = 512,
|
||||||
indexes,
|
indexes,
|
||||||
model,
|
embeddingModel,
|
||||||
session
|
session
|
||||||
}: CreateDatasetDataProps & {
|
}: CreateDatasetDataProps & {
|
||||||
model: string;
|
embeddingModel: string;
|
||||||
|
indexSize?: number;
|
||||||
session?: ClientSession;
|
session?: ClientSession;
|
||||||
}) {
|
}) {
|
||||||
if (!q || !datasetId || !collectionId || !model) {
|
if (!q || !datasetId || !collectionId || !embeddingModel) {
|
||||||
return Promise.reject('q, datasetId, collectionId, model is required');
|
return Promise.reject('q, datasetId, collectionId, embeddingModel is required');
|
||||||
}
|
}
|
||||||
if (String(teamId) === String(tmbId)) {
|
if (String(teamId) === String(tmbId)) {
|
||||||
return Promise.reject("teamId and tmbId can't be the same");
|
return Promise.reject("teamId and tmbId can't be the same");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const embModel = getEmbeddingModel(embeddingModel);
|
||||||
|
indexSize = Math.min(embModel.maxToken, indexSize);
|
||||||
|
|
||||||
// 1. Get vector indexes and insert
|
// 1. Get vector indexes and insert
|
||||||
// Empty indexes check, if empty, create default index
|
// Empty indexes check, if empty, create default index
|
||||||
const newIndexes = formatIndexes({ indexes, q, a });
|
const newIndexes = await formatIndexes({
|
||||||
|
indexes,
|
||||||
|
q,
|
||||||
|
a,
|
||||||
|
indexSize
|
||||||
|
});
|
||||||
|
|
||||||
// insert to vector store
|
// insert to vector store
|
||||||
const result = await Promise.all(
|
const result = await Promise.all(
|
||||||
newIndexes.map(async (item) => {
|
newIndexes.map(async (item) => {
|
||||||
const result = await insertDatasetDataVector({
|
const result = await insertDatasetDataVector({
|
||||||
query: item.text,
|
query: item.text,
|
||||||
model: getEmbeddingModel(model),
|
model: embModel,
|
||||||
teamId,
|
teamId,
|
||||||
datasetId,
|
datasetId,
|
||||||
collectionId
|
collectionId
|
||||||
@ -163,8 +224,9 @@ export async function updateData2Dataset({
|
|||||||
q = '',
|
q = '',
|
||||||
a,
|
a,
|
||||||
indexes,
|
indexes,
|
||||||
model
|
model,
|
||||||
}: UpdateDatasetDataProps & { model: string }) {
|
indexSize = 512
|
||||||
|
}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
|
||||||
if (!Array.isArray(indexes)) {
|
if (!Array.isArray(indexes)) {
|
||||||
return Promise.reject('indexes is required');
|
return Promise.reject('indexes is required');
|
||||||
}
|
}
|
||||||
@@ -174,7 +236,7 @@ export async function updateData2Dataset({
   if (!mongoData) return Promise.reject('core.dataset.error.Data not found');

   // 2. Compute indexes
-  const formatIndexesResult = formatIndexes({ indexes, q, a });
+  const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });

   // 3. Patch indexes, create, update, delete
   const patchResult: PatchIndexesProps[] = [];
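`indexSize` is added to `updateData2Dataset` as an optional destructured parameter with a default of 512 and forwarded into `formatIndexes`, so updated data is re-indexed under the same size limit as freshly inserted data. Existing callers that never pass the field keep their old behaviour; a quick sketch of why the destructuring default makes the change backward compatible:

```ts
// Defaults in destructured parameters apply when the field is missing or undefined,
// so callers written before indexSize existed still get the 512 behaviour.
function demo({ q, indexSize = 512 }: { q: string; indexSize?: number }) {
  return { q, indexSize };
}

console.log(demo({ q: 'hello' }));                       // { q: 'hello', indexSize: 512 }
console.log(demo({ q: 'hello', indexSize: 1024 }));      // { q: 'hello', indexSize: 1024 }
console.log(demo({ q: 'hello', indexSize: undefined })); // { q: 'hello', indexSize: 512 }
```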
@@ -21,6 +21,11 @@ import {
   llmCompletionsBodyFormat,
   llmStreamResponseToAnswerText
 } from '@fastgpt/service/core/ai/utils';
+import { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
+import {
+  chunkAutoChunkSize,
+  getLLMMaxChunkSize
+} from '@fastgpt/global/core/dataset/training/utils';

 const reduceQueue = () => {
   global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@@ -129,7 +134,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
     });
     const answer = await llmStreamResponseToAnswerText(chatResponse);

-    const qaArr = formatSplitText(answer, text); // the formatted QA pairs
+    const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // the formatted QA pairs

     addLog.info(`[QA Queue] Finish`, {
       time: Date.now() - startTime,
@@ -180,10 +185,18 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
 }

 // Format qa answer
-function formatSplitText(text: string, rawText: string) {
-  text = text.replace(/\\n/g, '\n'); // convert escaped "\n" into real newlines
+function formatSplitText({
+  answer,
+  rawText,
+  llmModel
+}: {
+  answer: string;
+  rawText: string;
+  llmModel: LLMModelItemType;
+}) {
+  answer = answer.replace(/\\n/g, '\n'); // convert escaped "\n" into real newlines
   const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // regex matching each Q/A pair
-  const matches = text.matchAll(regex); // collect every match
+  const matches = answer.matchAll(regex); // collect every match

   const result: PushDatasetDataChunkProps[] = []; // stores the final result
   for (const match of matches) {
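`formatSplitText` now takes a single object (`answer`, `rawText`, `llmModel`) instead of positional arguments, but the extraction itself is unchanged: the regex walks the model's answer and captures each question/answer pair. A standalone sketch of that regex on a made-up answer:

```ts
// Same regex as in the diff; the sample answer below is invented for illustration.
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g;

const answer = `Q1: What is FastGPT?
A1: An open-source LLM knowledge-base platform.
Q2: What does chunkSize control?
A2: The length of each text chunk produced during import.`;

const pairs = Array.from(answer.matchAll(regex)).map((m) => ({
  q: m[2].trim(), // capture group 2: the question text
  a: m[5].trim()  // capture group 5: the answer text
}));

console.log(pairs.length); // 2
console.log(pairs[0]);     // { q: 'What is FastGPT?', a: 'An open-source LLM knowledge-base platform.' }
```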
@@ -199,7 +212,11 @@ function formatSplitText(text: string, rawText: string) {

   // empty result. direct split chunk
   if (result.length === 0) {
-    const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 });
+    const { chunks } = splitText2Chunks({
+      text: rawText,
+      chunkSize: chunkAutoChunkSize,
+      maxSize: getLLMMaxChunkSize(llmModel)
+    });
     chunks.forEach((chunk) => {
       result.push({
         q: chunk,
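When no Q/A pairs can be parsed, the raw text is split directly. The new code pulls `chunkAutoChunkSize` and `getLLMMaxChunkSize` from the training utils so the fallback chunks also respect the LLM's context window; their real definitions are not part of this diff, so the sketch below uses assumed values purely to show the intent:

```ts
// Assumed stand-ins: the real chunkAutoChunkSize and getLLMMaxChunkSize live in
// @fastgpt/global/core/dataset/training/utils and may differ from these values.
const chunkAutoChunkSize = 1500;
const getLLMMaxChunkSize = (llmModel: { maxContext: number }) =>
  Math.max(2000, llmModel.maxContext - 4000);

// Fallback: chunk the raw text directly, never exceeding what the LLM can hold.
const fallbackChunks = (rawText: string, llmModel: { maxContext: number }) => {
  const size = Math.min(chunkAutoChunkSize, getLLMMaxChunkSize(llmModel));
  const chunks: string[] = [];
  for (let i = 0; i < rawText.length; i += size) {
    chunks.push(rawText.slice(i, i + size));
  }
  return chunks;
};

console.log(fallbackChunks('x'.repeat(4000), { maxContext: 16000 }).length); // 3
```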
@@ -245,7 +245,7 @@ const insertData = async ({
     a: trainingData.a,
     chunkIndex: trainingData.chunkIndex,
     indexes: trainingData.indexes,
-    model: trainingData.model,
+    embeddingModel: trainingData.model,
     session
   });
   // delete data from training
@@ -60,15 +60,11 @@ export const defaultCollectionDetail: DatasetCollectionItemType = {
   createTime: new Date(),
   trainingType: DatasetCollectionDataProcessModeEnum.chunk,
   chunkSize: 0,
+  indexSize: 512,
   permission: new DatasetPermission(),
   indexAmount: 0
 };

-export enum ChunkSettingModeEnum {
-  auto = 'auto',
-  custom = 'custom'
-}
-
 export const datasetTypeCourseMap: Record<`${DatasetTypeEnum}`, string> = {
   [DatasetTypeEnum.folder]: '',
   [DatasetTypeEnum.dataset]: '',
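`ChunkSettingModeEnum` is removed from the web-side constants here because it now lives in `@fastgpt/global/core/dataset/constants` (see the import change in the next file). Its two values back the `chunkSettingMode` request field: `auto` means use the system defaults, `custom` means honour the caller's chunk parameters. A hypothetical resolver to illustrate the split — the helper name and the chunk-size number are illustrative; only the 512 index size matches the diff above:

```ts
// The enum as moved into the shared constants package by this commit.
enum ChunkSettingModeEnum {
  auto = 'auto',
  custom = 'custom'
}

// Hypothetical helper, not a FastGPT API: resolve effective chunk parameters.
const resolveChunkParams = (body: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSize?: number;
  indexSize?: number;
}) => {
  if (body.chunkSettingMode !== ChunkSettingModeEnum.custom) {
    // auto (or omitted): ignore caller values and use system defaults
    return { chunkSize: 1500, indexSize: 512 };
  }
  return {
    chunkSize: body.chunkSize ?? 1500,
    indexSize: body.indexSize ?? 512
  };
};

console.log(resolveChunkParams({})); // { chunkSize: 1500, indexSize: 512 }
console.log(resolveChunkParams({ chunkSettingMode: ChunkSettingModeEnum.custom, chunkSize: 800 }));
// { chunkSize: 800, indexSize: 512 }
```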
projects/app/src/web/core/dataset/type.d.ts (4 changes, vendored)
@@ -1,6 +1,6 @@
 import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
 import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
-import { ChunkSettingModeEnum } from './constants';
+import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
 import { UseFormReturn } from 'react-hook-form';
 import { APIFileItem } from '@fastgpt/global/core/dataset/apiDataset';

@@ -41,7 +41,7 @@ export type ImportSourceParamsType = UseFormReturn<
   {
     chunkSize: number;
     chunkOverlapRatio: number;
-    customSplitChar: string;
+    chunkSplitter: string;
     prompt: string;
     mode: TrainingModeEnum;
     way: ChunkSettingModeEnum;