feat: chunk index independent config (#4271)

* sync collection

* remove lock

* feat: chunk index independent config

* feat: add max chunksize to split chunk function

* remove log

* update doc

* remove

* remove log
Archer 2025-03-21 16:44:25 +08:00 committed by archer
parent 222ff0d49a
commit e812ad6e84
47 changed files with 784 additions and 443 deletions

View File

@ -11,8 +11,6 @@ weight: 853
| --------------------- | --------------------- | | --------------------- | --------------------- |
| ![](/imgs/getDatasetId.jpg) | ![](/imgs/getfile_id.webp) | | ![](/imgs/getDatasetId.jpg) | ![](/imgs/getfile_id.webp) |
## 创建训练订单 ## 创建训练订单
{{< tabs tabTotal="2" >}} {{< tabs tabTotal="2" >}}
@ -289,7 +287,7 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
## 集合 ## 集合
### 通用创建参数说明 ### 通用创建参数说明(必看)
**入参** **入参**
@ -300,8 +298,11 @@ curl --location --request DELETE 'http://localhost:3000/api/core/dataset/delete?
| trainingType | 数据处理方式。chunk: 按文本长度进行分割;qa: 问答对提取 | ✅ | | trainingType | 数据处理方式。chunk: 按文本长度进行分割;qa: 问答对提取 | ✅ |
| autoIndexes | 是否自动生成索引(仅商业版支持) | | | autoIndexes | 是否自动生成索引(仅商业版支持) | |
| imageIndex | 是否自动生成图片索引(仅商业版支持) | | | imageIndex | 是否自动生成图片索引(仅商业版支持) | |
| chunkSize | 预估块大小 | | | chunkSettingMode | 分块参数模式。auto: 系统默认参数; custom: 手动指定参数 | |
| chunkSplitter | 自定义最高优先分割符号 | | | chunkSplitMode | 分块拆分模式。size: 按长度拆分; char: 按字符拆分。chunkSettingMode=auto时不生效。 | |
| chunkSize | 分块大小,默认 1500。chunkSettingMode=auto时不生效。 | |
| indexSize | 索引大小,默认 512,必须小于索引模型最大 token。chunkSettingMode=auto时不生效。 | |
| chunkSplitter | 自定义最高优先分割符号,除非超出文件处理最大上下文,否则不会进行进一步拆分。chunkSettingMode=auto时不生效。 | |
| qaPrompt | qa拆分提示词 | | | qaPrompt | qa拆分提示词 | |
| tags | 集合标签(字符串数组) | | | tags | 集合标签(字符串数组) | |
| createTime | 文件创建时间(Date / String) | | | createTime | 文件创建时间(Date / String) | |
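For illustration, a request body that opts into the new custom chunk settings might look like the sketch below (TypeScript; all concrete values are assumptions, and the body would be POSTed to one of the collection-create endpoints shown in the curl examples that follow):

```ts
// Sketch only: a "custom" chunk-setting payload for the collection create APIs.
// Every concrete value here is illustrative, not a documented default.
const customChunkBody = {
  datasetId: '6593e137231a2be9c5603ba7', // hypothetical dataset id
  parentId: null,
  name: '测试训练',
  trainingType: 'chunk',
  chunkSettingMode: 'custom', // use the manual parameters below instead of system defaults
  chunkSplitMode: 'size',     // 'char' would split on chunkSplitter instead of by length
  chunkSize: 2000,            // ignored when chunkSettingMode = 'auto'
  indexSize: 1024,            // must stay below the embedding model's max token
  chunkSplitter: '',
  qaPrompt: '',
  metadata: {}
};

// Serialize and POST to the matching collection create endpoint.
console.log(JSON.stringify(customChunkBody, null, 2));
```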
@ -389,9 +390,8 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
"name":"测试训练", "name":"测试训练",
"trainingType": "qa", "trainingType": "qa",
"chunkSize":8000, "chunkSettingMode": "auto",
"chunkSplitter":"", "qaPrompt":"",
"qaPrompt":"11",
"metadata":{} "metadata":{}
}' }'
@ -409,10 +409,6 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
- parentId 父级ID,不填则默认为根目录 - parentId 父级ID,不填则默认为根目录
- name: 集合名称(必填) - name: 集合名称(必填)
- metadata 元数据(暂时没啥用) - metadata 元数据(暂时没啥用)
- trainingType: 训练模式(必填)
- chunkSize: 每个 chunk 的长度(可选). chunk模式:100~3000; qa模式: 4000~模型最大token(16k模型通常建议不超过10000)
- chunkSplitter: 自定义最高优先分割符号(可选)
- qaPrompt: qa拆分自定义提示词可选
{{% /alert %}} {{% /alert %}}
{{< /markdownify >}} {{< /markdownify >}}
@ -462,8 +458,7 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
"parentId": null, "parentId": null,
"trainingType": "chunk", "trainingType": "chunk",
"chunkSize":512, "chunkSettingMode": "auto",
"chunkSplitter":"",
"qaPrompt":"", "qaPrompt":"",
"metadata":{ "metadata":{
@ -483,10 +478,6 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
- datasetId: 知识库的ID(必填) - datasetId: 知识库的ID(必填)
- parentId 父级ID,不填则默认为根目录 - parentId 父级ID,不填则默认为根目录
- metadata.webPageSelector: 网页选择器,用于指定网页中的哪个元素作为文本(可选) - metadata.webPageSelector: 网页选择器,用于指定网页中的哪个元素作为文本(可选)
- trainingType:训练模式(必填)
- chunkSize: 每个 chunk 的长度(可选). chunk模式:100~3000; qa模式: 4000~模型最大token(16k模型通常建议不超过10000)
- chunkSplitter: 自定义最高优先分割符号(可选)
- qaPrompt: qa拆分自定义提示词可选
{{% /alert %}} {{% /alert %}}
{{< /markdownify >}} {{< /markdownify >}}
@ -545,13 +536,7 @@ curl --location --request POST 'http://localhost:3000/api/core/dataset/collectio
{{% alert icon=" " context="success" %}} {{% alert icon=" " context="success" %}}
- file: 文件 - file: 文件
- data: 知识库相关信息(json序列化后传入) - data: 知识库相关信息(json序列化后传入),参数说明见上方“通用创建参数说明”
- datasetId: 知识库的ID(必填)
- parentId 父级ID,不填则默认为根目录
- trainingType:训练模式(必填)
- chunkSize: 每个 chunk 的长度(可选). chunk模式:100~3000; qa模式: 4000~模型最大token(16k模型通常建议不超过10000)
- chunkSplitter: 自定义最高优先分割符号(可选)
- qaPrompt: qa拆分自定义提示词可选
{{% /alert %}} {{% /alert %}}
{{< /markdownify >}} {{< /markdownify >}}

View File

@ -7,12 +7,17 @@ toc: true
weight: 799 weight: 799
--- ---
## 重要提示
- 知识库导入数据 API 变更,增加`chunkSettingMode`,`chunkSplitMode`,`indexSize`可选参数,具体可参考 [知识库导入数据 API](/docs/development/openapi/dataset) 文档。
## 🚀 新增内容 ## 🚀 新增内容
1. 知识库分块增加自定义分隔符预设值,同时支持自定义换行符分割。 1. 知识库分块优化:支持单独配置分块大小和索引大小,允许进行超大分块,以更大的输入 Tokens 换取完整分块。
2. 外部变量改名:自定义变量。 并且支持在测试时调试,在分享链接中,该变量直接隐藏。 2. 知识库分块增加自定义分隔符预设值,同时支持自定义换行符分割。
3. 集合同步时,支持同步修改标题。 3. 外部变量改名:自定义变量。 并且支持在测试时调试,在分享链接中,该变量直接隐藏。
4. 集合同步时,支持同步修改标题。
## ⚙️ 优化 ## ⚙️ 优化

View File

@ -1,15 +1,17 @@
import { defaultMaxChunkSize } from '../../core/dataset/training/utils';
import { getErrText } from '../error/utils'; import { getErrText } from '../error/utils';
export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----'; export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
type SplitProps = { type SplitProps = {
text: string; text: string;
chunkLen: number; chunkSize: number;
maxSize?: number;
overlapRatio?: number; overlapRatio?: number;
customReg?: string[]; customReg?: string[];
}; };
export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkLen'> & { export type TextSplitProps = Omit<SplitProps, 'text' | 'chunkSize'> & {
chunkLen?: number; chunkSize?: number;
}; };
type SplitResponse = { type SplitResponse = {
@ -55,7 +57,7 @@ const strIsMdTable = (str: string) => {
return true; return true;
}; };
const markdownTableSplit = (props: SplitProps): SplitResponse => { const markdownTableSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen } = props; let { text = '', chunkSize } = props;
const splitText2Lines = text.split('\n'); const splitText2Lines = text.split('\n');
const header = splitText2Lines[0]; const header = splitText2Lines[0];
const headerSize = header.split('|').length - 2; const headerSize = header.split('|').length - 2;
@ -71,7 +73,7 @@ ${mdSplitString}
`; `;
for (let i = 2; i < splitText2Lines.length; i++) { for (let i = 2; i < splitText2Lines.length; i++) {
if (chunk.length + splitText2Lines[i].length > chunkLen * 1.2) { if (chunk.length + splitText2Lines[i].length > chunkSize * 1.2) {
chunks.push(chunk); chunks.push(chunk);
chunk = `${header} chunk = `${header}
${mdSplitString} ${mdSplitString}
@ -98,11 +100,17 @@ ${mdSplitString}
5. 5.
*/ */
const commonSplit = (props: SplitProps): SplitResponse => { const commonSplit = (props: SplitProps): SplitResponse => {
let { text = '', chunkLen, overlapRatio = 0.15, customReg = [] } = props; let {
text = '',
chunkSize,
maxSize = defaultMaxChunkSize,
overlapRatio = 0.15,
customReg = []
} = props;
const splitMarker = 'SPLIT_HERE_SPLIT_HERE'; const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER'; const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
const overlapLen = Math.round(chunkLen * overlapRatio); const overlapLen = Math.round(chunkSize * overlapRatio);
// replace code block all \n to codeBlockMarker // replace code block all \n to codeBlockMarker
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) { text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
@ -118,24 +126,24 @@ const commonSplit = (props: SplitProps): SplitResponse => {
const stepReges: { reg: RegExp | string; maxLen: number }[] = [ const stepReges: { reg: RegExp | string; maxLen: number }[] = [
...customReg.map((text) => ({ ...customReg.map((text) => ({
reg: text.replaceAll('\\n', '\n'), reg: text.replaceAll('\\n', '\n'),
maxLen: chunkLen * 1.4 maxLen: chunkSize
})), })),
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkLen * 1.2 }, { reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkLen * 1.4 }, { reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkLen * 1.6 }, { reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, { reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkLen * 1.8 }, { reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block { reg: /([\n]([`~]))/g, maxLen: chunkSize }, // code block
{ reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkLen * 2 }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char { reg: /([\n](?=\s*[0-9]+\.))/g, maxLen: chunkSize }, // 增大块,尽可能保证它是一个完整的段落。 (?![\*\-|>`0-9]): markdown special char
{ reg: /(\n{2,})/g, maxLen: chunkLen * 1.6 }, { reg: /(\n{2,})/g, maxLen: chunkSize },
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 }, { reg: /([\n])/g, maxLen: chunkSize },
// ------ There's no overlap on the top // ------ There's no overlap on the top
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 }, { reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkSize },
{ reg: /([!]|!\s)/g, maxLen: chunkLen * 1.2 }, { reg: /([!]|!\s)/g, maxLen: chunkSize },
{ reg: /([?]|\?\s)/g, maxLen: chunkLen * 1.4 }, { reg: /([?]|\?\s)/g, maxLen: chunkSize },
{ reg: /([;]|;\s)/g, maxLen: chunkLen * 1.6 }, { reg: /([;]|;\s)/g, maxLen: chunkSize },
{ reg: /([,]|,\s)/g, maxLen: chunkLen * 2 } { reg: /([,]|,\s)/g, maxLen: chunkSize }
]; ];
const customRegLen = customReg.length; const customRegLen = customReg.length;
@ -203,7 +211,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
/* Gets the overlap at the end of a text as the beginning of the next block */ /* Gets the overlap at the end of a text as the beginning of the next block */
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => { const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
const forbidOverlap = checkForbidOverlap(step); const forbidOverlap = checkForbidOverlap(step);
const maxOverlapLen = chunkLen * 0.4; const maxOverlapLen = chunkSize * 0.4;
// step >= stepReges.length: Do not overlap incomplete sentences // step >= stepReges.length: Do not overlap incomplete sentences
if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return ''; if (forbidOverlap || overlapLen === 0 || step >= stepReges.length) return '';
@ -246,13 +254,13 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// oversize // oversize
if (step >= stepReges.length) { if (step >= stepReges.length) {
if (text.length < chunkLen * 3) { if (text.length < chunkSize * 3) {
return [text]; return [text];
} }
// use slice-chunkLen to split text // use slice-chunkSize to split text
const chunks: string[] = []; const chunks: string[] = [];
for (let i = 0; i < text.length; i += chunkLen - overlapLen) { for (let i = 0; i < text.length; i += chunkSize - overlapLen) {
chunks.push(text.slice(i, i + chunkLen)); chunks.push(text.slice(i, i + chunkSize));
} }
return chunks; return chunks;
} }
@ -260,8 +268,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
// split text by special char // split text by special char
const splitTexts = getSplitTexts({ text, step }); const splitTexts = getSplitTexts({ text, step });
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen; const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkSize;
const minChunkLen = chunkLen * 0.7; const minChunkLen = chunkSize * 0.7;
const chunks: string[] = []; const chunks: string[] = [];
for (let i = 0; i < splitTexts.length; i++) { for (let i = 0; i < splitTexts.length; i++) {
@ -297,7 +305,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
continue; continue;
} }
// newText is too large(now, The lastText must be smaller than chunkLen) // newText is too large(now, The lastText must be smaller than chunkSize)
if (newTextLen > maxLen) { if (newTextLen > maxLen) {
// lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText) // lastText greater minChunkLen, direct push it to chunks, not add to next chunk. (large lastText)
if (lastTextLen > minChunkLen) { if (lastTextLen > minChunkLen) {
@ -352,7 +360,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
/* If the last chunk is independent, it needs to be push chunks. */ /* If the last chunk is independent, it needs to be push chunks. */
if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) { if (lastText && chunks[chunks.length - 1] && !chunks[chunks.length - 1].endsWith(lastText)) {
if (lastText.length < chunkLen * 0.4) { if (lastText.length < chunkSize * 0.4) {
chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText; chunks[chunks.length - 1] = chunks[chunks.length - 1] + lastText;
} else { } else {
chunks.push(lastText); chunks.push(lastText);
@ -386,9 +394,9 @@ const commonSplit = (props: SplitProps): SplitResponse => {
/** /**
* text split into chunks * text split into chunks
* chunkLen - one chunk len. max: 3500 * chunkSize - one chunk len. max: 3500
* overlapLen - The size of the before and after Text * overlapLen - The size of the before and after Text
* chunkLen > overlapLen * chunkSize > overlapLen
* markdown * markdown
*/ */
export const splitText2Chunks = (props: SplitProps): SplitResponse => { export const splitText2Chunks = (props: SplitProps): SplitResponse => {
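To make the renamed parameters concrete, here is a minimal usage sketch of the splitter; the import path is an assumption, while the `chunkSize`/`maxSize` fields and the destructured `chunks` result follow the hunks in this commit:

```ts
// Sketch: calling splitText2Chunks with the renamed chunkSize and the new maxSize cap.
// The import path is assumed; adjust it to wherever textSplitter lives in the repo.
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';

const longMarkdownText = '# Title\n\nSome long markdown content to be chunked...';

const { chunks } = splitText2Chunks({
  text: longMarkdownText,
  chunkSize: 1500,   // target block size (formerly chunkLen)
  maxSize: 8000,     // hard cap, normally derived from the LLM context window
  overlapRatio: 0.2, // ~20% overlap between neighbouring blocks
  customReg: []      // optional highest-priority custom splitters
});

console.log(chunks.length);
```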

View File

@ -1,5 +1,10 @@
import { DatasetDataIndexItemType, DatasetSchemaType } from './type'; import { DatasetDataIndexItemType, DatasetSchemaType } from './type';
import { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum } from './constants'; import {
DatasetCollectionTypeEnum,
DatasetCollectionDataProcessModeEnum,
ChunkSettingModeEnum,
DataChunkSplitModeEnum
} from './constants';
import type { LLMModelItemType } from '../ai/model.d'; import type { LLMModelItemType } from '../ai/model.d';
import { ParentIdType } from 'common/parentFolder/type'; import { ParentIdType } from 'common/parentFolder/type';
@ -33,7 +38,13 @@ export type DatasetCollectionChunkMetadataType = {
trainingType?: DatasetCollectionDataProcessModeEnum; trainingType?: DatasetCollectionDataProcessModeEnum;
imageIndex?: boolean; imageIndex?: boolean;
autoIndexes?: boolean; autoIndexes?: boolean;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number; chunkSize?: number;
indexSize?: number;
chunkSplitter?: string; chunkSplitter?: string;
qaPrompt?: string; qaPrompt?: string;
metadata?: Record<string, any>; metadata?: Record<string, any>;

View File

@ -129,6 +129,16 @@ export const DatasetCollectionDataProcessModeMap = {
} }
}; };
export enum ChunkSettingModeEnum {
auto = 'auto',
custom = 'custom'
}
export enum DataChunkSplitModeEnum {
size = 'size',
char = 'char'
}
/* ------------ data -------------- */ /* ------------ data -------------- */
/* ------------ training -------------- */ /* ------------ training -------------- */

View File

@ -13,6 +13,7 @@ export type CreateDatasetDataProps = {
export type UpdateDatasetDataProps = { export type UpdateDatasetDataProps = {
dataId: string; dataId: string;
q?: string; q?: string;
a?: string; a?: string;
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & {

View File

@ -15,6 +15,8 @@ export type PushDataToTrainingQueueProps = {
vectorModel: string; vectorModel: string;
vlmModel?: string; vlmModel?: string;
indexSize?: number;
billId?: string; billId?: string;
session?: ClientSession; session?: ClientSession;
}; };

View File

@ -0,0 +1,136 @@
import { EmbeddingModelItemType, LLMModelItemType } from '../../../core/ai/model.d';
import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum
} from '../constants';
export const minChunkSize = 64; // min index and chunk size
// Chunk size
export const chunkAutoChunkSize = 1500;
export const getMaxChunkSize = (model: LLMModelItemType) => {
return Math.max(model.maxContext - model.maxResponse, 2000);
};
// QA
export const defaultMaxChunkSize = 8000;
export const getLLMDefaultChunkSize = (model?: LLMModelItemType) => {
if (!model) return defaultMaxChunkSize;
return Math.max(Math.min(model.maxContext - model.maxResponse, defaultMaxChunkSize), 2000);
};
export const getLLMMaxChunkSize = (model?: LLMModelItemType) => {
if (!model) return 8000;
return Math.max(model.maxContext - model.maxResponse, 2000);
};
// Index size
export const getMaxIndexSize = (model?: EmbeddingModelItemType) => {
return model?.maxToken || 512;
};
export const getAutoIndexSize = (model?: EmbeddingModelItemType) => {
return model?.defaultToken || 512;
};
const indexSizeSelectList = [
{
label: '64',
value: 64
},
{
label: '128',
value: 128
},
{
label: '256',
value: 256
},
{
label: '512',
value: 512
},
{
label: '768',
value: 768
},
{
label: '1024',
value: 1024
},
{
label: '1536',
value: 1536
},
{
label: '2048',
value: 2048
},
{
label: '3072',
value: 3072
},
{
label: '4096',
value: 4096
},
{
label: '5120',
value: 5120
},
{
label: '6144',
value: 6144
},
{
label: '7168',
value: 7168
},
{
label: '8192',
value: 8192
}
];
export const getIndexSizeSelectList = (max = 512) => {
return indexSizeSelectList.filter((item) => item.value <= max);
};
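As a quick illustration of how this list is meant to be consumed, the options can be capped by the selected embedding model (a sketch; the model values are hypothetical):

```ts
// Sketch: build index-size options for a hypothetical embedding model.
const embeddingModel = { maxToken: 3072, defaultToken: 512 } as EmbeddingModelItemType;

const indexSizeOptions = getIndexSizeSelectList(getMaxIndexSize(embeddingModel));
// -> options for 64, 128, 256, 512, 768, 1024, 1536, 2048 and 3072
```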
// Compute
export const computeChunkSize = (params: {
trainingType: DatasetCollectionDataProcessModeEnum;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
llmModel?: LLMModelItemType;
chunkSize?: number;
}) => {
if (params.trainingType === DatasetCollectionDataProcessModeEnum.qa) {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return getLLMDefaultChunkSize(params.llmModel);
}
} else {
// chunk
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return chunkAutoChunkSize;
}
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.char) {
return getLLMMaxChunkSize(params.llmModel);
}
return Math.min(params.chunkSize || chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
};
export const computeChunkSplitter = (params: {
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSplitter?: string;
}) => {
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
return undefined;
}
if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
return undefined;
}
return params.chunkSplitter;
};
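A short sketch of how these two helpers resolve a request's effective settings (the model figures are hypothetical; enum values come from the constants added in this commit):

```ts
// Sketch: resolving effective chunk settings for a hypothetical custom-mode request.
const llmModel = { maxContext: 16000, maxResponse: 4000 } as LLMModelItemType;

const chunkSize = computeChunkSize({
  trainingType: DatasetCollectionDataProcessModeEnum.chunk,
  chunkSettingMode: ChunkSettingModeEnum.custom,
  chunkSplitMode: DataChunkSplitModeEnum.size,
  llmModel,
  chunkSize: 2000
}); // -> 2000, clamped to getLLMMaxChunkSize(llmModel) = 12000

const chunkSplitter = computeChunkSplitter({
  chunkSettingMode: ChunkSettingModeEnum.custom,
  chunkSplitMode: DataChunkSplitModeEnum.size,
  chunkSplitter: '\\n'
}); // -> undefined: size mode ignores the custom splitter
```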

View File

@ -2,6 +2,7 @@ import type { LLMModelItemType, EmbeddingModelItemType } from '../../core/ai/mod
import { PermissionTypeEnum } from '../../support/permission/constant'; import { PermissionTypeEnum } from '../../support/permission/constant';
import { PushDatasetDataChunkProps } from './api'; import { PushDatasetDataChunkProps } from './api';
import { import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
DatasetCollectionTypeEnum, DatasetCollectionTypeEnum,
DatasetStatusEnum, DatasetStatusEnum,
@ -14,6 +15,7 @@ import { Permission } from '../../support/permission/controller';
import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset'; import { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
import { SourceMemberType } from 'support/user/type'; import { SourceMemberType } from 'support/user/type';
import { DatasetDataIndexTypeEnum } from './data/constants'; import { DatasetDataIndexTypeEnum } from './data/constants';
import { ChunkSettingModeEnum } from './constants';
export type DatasetSchemaType = { export type DatasetSchemaType = {
_id: string; _id: string;
@ -88,7 +90,12 @@ export type DatasetCollectionSchemaType = {
autoIndexes?: boolean; autoIndexes?: boolean;
imageIndex?: boolean; imageIndex?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum; trainingType: DatasetCollectionDataProcessModeEnum;
chunkSize: number;
chunkSettingMode?: ChunkSettingModeEnum;
chunkSplitMode?: DataChunkSplitModeEnum;
chunkSize?: number;
indexSize?: number;
chunkSplitter?: string; chunkSplitter?: string;
qaPrompt?: string; qaPrompt?: string;
}; };

View File

@ -1,7 +1,6 @@
import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants'; import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants';
import { getFileIcon } from '../../common/file/icon'; import { getFileIcon } from '../../common/file/icon';
import { strIsLink } from '../../common/string/tools'; import { strIsLink } from '../../common/string/tools';
import { DatasetDataIndexTypeEnum } from './data/constants';
export function getCollectionIcon( export function getCollectionIcon(
type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file, type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file,
@ -38,26 +37,6 @@ export function getSourceNameIcon({
return 'file/fill/file'; return 'file/fill/file';
} }
/* get dataset data default index */
export function getDefaultIndex(props?: { q?: string; a?: string }) {
const { q = '', a } = props || {};
return [
{
text: q,
type: DatasetDataIndexTypeEnum.default
},
...(a
? [
{
text: a,
type: DatasetDataIndexTypeEnum.default
}
]
: [])
];
}
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => { export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
if (mode === TrainingModeEnum.qa) return data.length * 20; if (mode === TrainingModeEnum.qa) return data.length * 20;
if (mode === TrainingModeEnum.auto) return data.length * 5; if (mode === TrainingModeEnum.auto) return data.length * 5;

View File

@ -27,6 +27,11 @@ import { addDays } from 'date-fns';
import { MongoDatasetDataText } from '../data/dataTextSchema'; import { MongoDatasetDataText } from '../data/dataTextSchema';
import { retryFn } from '@fastgpt/global/common/system/utils'; import { retryFn } from '@fastgpt/global/common/system/utils';
import { getTrainingModeByCollection } from './utils'; import { getTrainingModeByCollection } from './utils';
import {
computeChunkSize,
computeChunkSplitter,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
export const createCollectionAndInsertData = async ({ export const createCollectionAndInsertData = async ({
dataset, dataset,
@ -54,18 +59,22 @@ export const createCollectionAndInsertData = async ({
const teamId = createCollectionParams.teamId; const teamId = createCollectionParams.teamId;
const tmbId = createCollectionParams.tmbId; const tmbId = createCollectionParams.tmbId;
// Chunk split params
// Set default params
const trainingType = const trainingType =
createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk;
const chunkSize = createCollectionParams.chunkSize || 512; const chunkSize = computeChunkSize({
const chunkSplitter = createCollectionParams.chunkSplitter; ...createCollectionParams,
const qaPrompt = createCollectionParams.qaPrompt; trainingType,
const usageName = createCollectionParams.name; llmModel: getLLMModel(dataset.agentModel)
});
const chunkSplitter = computeChunkSplitter(createCollectionParams);
// 1. split chunks // 1. split chunks
const chunks = rawText2Chunks({ const chunks = rawText2Chunks({
rawText, rawText,
chunkLen: chunkSize, chunkSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
customReg: chunkSplitter ? [chunkSplitter] : [], customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport isQAImport
@ -76,7 +85,7 @@ export const createCollectionAndInsertData = async ({
teamId, teamId,
insertLen: predictDataLimitLength( insertLen: predictDataLimitLength(
getTrainingModeByCollection({ getTrainingModeByCollection({
trainingType, trainingType: trainingType,
autoIndexes: createCollectionParams.autoIndexes, autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex imageIndex: createCollectionParams.imageIndex
}), }),
@ -88,6 +97,9 @@ export const createCollectionAndInsertData = async ({
// 3. create collection // 3. create collection
const { _id: collectionId } = await createOneCollection({ const { _id: collectionId } = await createOneCollection({
...createCollectionParams, ...createCollectionParams,
trainingType,
chunkSize,
chunkSplitter,
hashRawText: hashStr(rawText), hashRawText: hashStr(rawText),
rawTextLength: rawText.length, rawTextLength: rawText.length,
@ -111,7 +123,7 @@ export const createCollectionAndInsertData = async ({
const { billId: newBillId } = await createTrainingUsage({ const { billId: newBillId } = await createTrainingUsage({
teamId, teamId,
tmbId, tmbId,
appName: usageName, appName: createCollectionParams.name,
billSource: UsageSourceEnum.training, billSource: UsageSourceEnum.training,
vectorModel: getEmbeddingModel(dataset.vectorModel)?.name, vectorModel: getEmbeddingModel(dataset.vectorModel)?.name,
agentModel: getLLMModel(dataset.agentModel)?.name, agentModel: getLLMModel(dataset.agentModel)?.name,
@ -130,12 +142,13 @@ export const createCollectionAndInsertData = async ({
agentModel: dataset.agentModel, agentModel: dataset.agentModel,
vectorModel: dataset.vectorModel, vectorModel: dataset.vectorModel,
vlmModel: dataset.vlmModel, vlmModel: dataset.vlmModel,
indexSize: createCollectionParams.indexSize,
mode: getTrainingModeByCollection({ mode: getTrainingModeByCollection({
trainingType, trainingType: trainingType,
autoIndexes: createCollectionParams.autoIndexes, autoIndexes: createCollectionParams.autoIndexes,
imageIndex: createCollectionParams.imageIndex imageIndex: createCollectionParams.imageIndex
}), }),
prompt: qaPrompt, prompt: createCollectionParams.qaPrompt,
billId: traingBillId, billId: traingBillId,
data: chunks.map((item, index) => ({ data: chunks.map((item, index) => ({
...item, ...item,
@ -207,11 +220,14 @@ export async function createOneCollection({
// Parse settings // Parse settings
customPdfParse, customPdfParse,
imageIndex, imageIndex,
autoIndexes,
// Chunk settings // Chunk settings
trainingType = DatasetCollectionDataProcessModeEnum.chunk, trainingType,
autoIndexes, chunkSettingMode,
chunkSize = 512, chunkSplitMode,
chunkSize,
indexSize,
chunkSplitter, chunkSplitter,
qaPrompt, qaPrompt,
@ -249,11 +265,14 @@ export async function createOneCollection({
// Parse settings // Parse settings
customPdfParse, customPdfParse,
imageIndex, imageIndex,
autoIndexes,
// Chunk settings // Chunk settings
trainingType, trainingType,
autoIndexes, chunkSettingMode,
chunkSplitMode,
chunkSize, chunkSize,
indexSize,
chunkSplitter, chunkSplitter,
qaPrompt qaPrompt
} }

View File

@ -3,7 +3,9 @@ const { Schema, model, models } = connectionMongo;
import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d'; import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type.d';
import { import {
DatasetCollectionTypeMap, DatasetCollectionTypeMap,
DatasetCollectionDataProcessModeEnum DatasetCollectionDataProcessModeEnum,
ChunkSettingModeEnum,
DataChunkSplitModeEnum
} from '@fastgpt/global/core/dataset/constants'; } from '@fastgpt/global/core/dataset/constants';
import { DatasetCollectionName } from '../schema'; import { DatasetCollectionName } from '../schema';
import { import {
@ -94,11 +96,18 @@ const DatasetCollectionSchema = new Schema({
type: String, type: String,
enum: Object.values(DatasetCollectionDataProcessModeEnum) enum: Object.values(DatasetCollectionDataProcessModeEnum)
}, },
chunkSize: { chunkSettingMode: {
type: Number, type: String,
required: true enum: Object.values(ChunkSettingModeEnum)
}, },
chunkSplitMode: {
type: String,
enum: Object.values(DataChunkSplitModeEnum)
},
chunkSize: Number,
chunkSplitter: String, chunkSplitter: String,
indexSize: Number,
qaPrompt: String qaPrompt: String
}); });
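For orientation, a collection document might now persist the chunk settings roughly as below (values are illustrative; only the chunk-related fields are shown):

```ts
// Sketch: chunk-related fields stored on a dataset collection after this change.
const collectionChunkFields = {
  trainingType: 'chunk',
  chunkSettingMode: 'custom', // optional; auto collections can omit the manual params
  chunkSplitMode: 'size',     // optional; 'char' splits on chunkSplitter
  chunkSize: 2000,            // no longer a required schema field
  indexSize: 1024,            // new independent index size
  chunkSplitter: '',
  qaPrompt: ''
};
```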

View File

@ -185,7 +185,7 @@ export const readApiServerFileContent = async ({
export const rawText2Chunks = ({ export const rawText2Chunks = ({
rawText, rawText,
isQAImport, isQAImport,
chunkLen = 512, chunkSize = 512,
...splitProps ...splitProps
}: { }: {
rawText: string; rawText: string;
@ -198,7 +198,7 @@ export const rawText2Chunks = ({
const { chunks } = splitText2Chunks({ const { chunks } = splitText2Chunks({
text: rawText, text: rawText,
chunkLen, chunkSize,
...splitProps ...splitProps
}); });

View File

@ -12,6 +12,10 @@ import { getCollectionWithDataset } from '../controller';
import { mongoSessionRun } from '../../../common/mongo/sessionRun'; import { mongoSessionRun } from '../../../common/mongo/sessionRun';
import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type'; import { PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type';
import { i18nT } from '../../../../web/i18n/utils'; import { i18nT } from '../../../../web/i18n/utils';
import {
getLLMDefaultChunkSize,
getLLMMaxChunkSize
} from '../../../../global/core/dataset/training/utils';
export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => { export const lockTrainingDataByTeamId = async (teamId: string): Promise<any> => {
try { try {
@ -55,6 +59,7 @@ export async function pushDataListToTrainingQueue({
prompt, prompt,
billId, billId,
mode = TrainingModeEnum.chunk, mode = TrainingModeEnum.chunk,
indexSize,
session session
}: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> { }: PushDataToTrainingQueueProps): Promise<PushDatasetDataResponse> {
const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => { const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => {
@ -68,38 +73,41 @@ export async function pushDataListToTrainingQueue({
} }
return mode; return mode;
}; };
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
}
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(i18nT('common:error_llm_not_config'));
}
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
const { model, maxToken, weight } = await (async () => { const { model, maxToken, weight } = await (async () => {
if (mode === TrainingModeEnum.chunk) { if (mode === TrainingModeEnum.chunk) {
const vectorModelData = getEmbeddingModel(vectorModel);
if (!vectorModelData) {
return Promise.reject(i18nT('common:error_embedding_not_config'));
}
return { return {
maxToken: vectorModelData.maxToken * 1.5, maxToken: getLLMMaxChunkSize(agentModelData),
model: vectorModelData.model, model: vectorModelData.model,
weight: vectorModelData.weight weight: vectorModelData.weight
}; };
} }
if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) { if (mode === TrainingModeEnum.qa || mode === TrainingModeEnum.auto) {
const agentModelData = getLLMModel(agentModel);
if (!agentModelData) {
return Promise.reject(i18nT('common:error_llm_not_config'));
}
return { return {
maxToken: agentModelData.maxContext * 0.8, maxToken: getLLMMaxChunkSize(agentModelData),
model: agentModelData.model, model: agentModelData.model,
weight: 0 weight: 0
}; };
} }
if (mode === TrainingModeEnum.image) { if (mode === TrainingModeEnum.image) {
const vllmModelData = getVlmModel(vlmModel); const vllmModelData = getVlmModel(vlmModel);
if (!vllmModelData) { if (!vllmModelData) {
return Promise.reject(i18nT('common:error_vlm_not_config')); return Promise.reject(i18nT('common:error_vlm_not_config'));
} }
return { return {
maxToken: vllmModelData.maxContext * 0.8, maxToken: getLLMMaxChunkSize(vllmModelData),
model: vllmModelData.model, model: vllmModelData.model,
weight: 0 weight: 0
}; };
@ -107,10 +115,6 @@ export async function pushDataListToTrainingQueue({
return Promise.reject(`Training mode "${mode}" is inValid`); return Promise.reject(`Training mode "${mode}" is inValid`);
})(); })();
// Filter redundant params
if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) {
prompt = undefined;
}
// filter repeat or equal content // filter repeat or equal content
const set = new Set(); const set = new Set();
@ -143,13 +147,13 @@ export async function pushDataListToTrainingQueue({
const text = item.q + item.a; const text = item.q + item.a;
// Oversize llm tokens
if (text.length > maxToken) { if (text.length > maxToken) {
filterResult.overToken.push(item); filterResult.overToken.push(item);
return; return;
} }
if (set.has(text)) { if (set.has(text)) {
console.log('repeat', item);
filterResult.repeat.push(item); filterResult.repeat.push(item);
} else { } else {
filterResult.success.push(item); filterResult.success.push(item);
@ -182,6 +186,7 @@ export async function pushDataListToTrainingQueue({
q: item.q, q: item.q,
a: item.a, a: item.a,
chunkIndex: item.chunkIndex ?? 0, chunkIndex: item.chunkIndex ?? 0,
indexSize,
weight: weight ?? 0, weight: weight ?? 0,
indexes: item.indexes, indexes: item.indexes,
retryCount: 5 retryCount: 5

View File

@ -76,6 +76,7 @@ const TrainingDataSchema = new Schema({
type: Number, type: Number,
default: 0 default: 0
}, },
indexSize: Number,
weight: { weight: {
type: Number, type: Number,
default: 0 default: 0

View File

@ -72,7 +72,7 @@ const EditFolderModal = ({
{...register('name', { required: true })} {...register('name', { required: true })}
bg={'myGray.50'} bg={'myGray.50'}
autoFocus autoFocus
maxLength={20} maxLength={100}
/> />
</Box> </Box>
<Box mt={4}> <Box mt={4}>

View File

@ -0,0 +1,67 @@
import React from 'react';
import { Box, Flex, Grid, type GridProps, HStack } from '@chakra-ui/react';
import { useTranslation } from 'next-i18next';
import QuestionTip from '../MyTooltip/QuestionTip';
type Props<T> = Omit<GridProps, 'onChange'> & {
list: {
title: string;
value: T;
tooltip?: string;
}[];
value: T;
defaultBg?: string;
activeBg?: string;
onChange: (e: T) => void;
};
const RadioGroup = <T = any,>({ list, value, onChange, ...props }: Props<T>) => {
const { t } = useTranslation();
return (
<Flex gap={[3, 5]} fontSize={['sm', 'md']} alignItems={'center'} {...props}>
{list.map((item) => (
<Flex
alignItems={'center'}
key={item.value as any}
cursor={'pointer'}
userSelect={'none'}
gap={1}
onClick={() => onChange(item.value)}
>
<Box
w={'18px'}
h={'18px'}
borderWidth={'2.4px'}
borderColor={value === item.value ? 'primary.015' : 'transparent'}
borderRadius={'50%'}
>
<Flex
w={'100%'}
h={'100%'}
borderWidth={'1px'}
borderColor={value === item.value ? 'primary.600' : 'borderColor.high'}
bg={value === item.value ? 'primary.1' : 'transparent'}
borderRadius={'50%'}
alignItems={'center'}
justifyContent={'center'}
>
<Box
w={'5px'}
h={'5px'}
borderRadius={'50%'}
bg={value === item.value ? 'primary.600' : 'transparent'}
/>
</Flex>
</Box>
<HStack spacing={1} color={'myGray.900'} whiteSpace={'nowrap'} fontSize={'sm'}>
<Box>{typeof item.title === 'string' ? t(item.title as any) : item.title}</Box>
{!!item.tooltip && <QuestionTip label={item.tooltip} color={'myGray.600'} />}
</HStack>
</Flex>
))}
</Flex>
);
};
export default RadioGroup;
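A minimal usage sketch for the new component (option values, i18n keys and the handler are illustrative):

```tsx
// Sketch: toggling the chunk split mode with the new RadioGroup (values illustrative).
import React, { useState } from 'react';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';

const SplitModePicker = () => {
  const [mode, setMode] = useState<'size' | 'char'>('size');

  return (
    <RadioGroup<'size' | 'char'>
      list={[
        { title: 'dataset:split_chunk_size', value: 'size' },
        { title: 'dataset:split_chunk_char', value: 'char' }
      ]}
      value={mode}
      onChange={setMode}
    />
  );
};

export default SplitModePicker;
```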

View File

@ -569,7 +569,6 @@
"core.dataset.import.Custom process": "Custom Rules", "core.dataset.import.Custom process": "Custom Rules",
"core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules", "core.dataset.import.Custom process desc": "Customize segmentation and preprocessing rules",
"core.dataset.import.Custom prompt": "Custom Prompt", "core.dataset.import.Custom prompt": "Custom Prompt",
"core.dataset.import.Custom split char": "Custom Separator",
"core.dataset.import.Custom text": "Custom Text", "core.dataset.import.Custom text": "Custom Text",
"core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset", "core.dataset.import.Custom text desc": "Manually enter a piece of text as a dataset",
"core.dataset.import.Data process params": "Data Processing Parameters", "core.dataset.import.Data process params": "Data Processing Parameters",

View File

@ -27,7 +27,6 @@
"custom_data_process_params": "Custom", "custom_data_process_params": "Custom",
"custom_data_process_params_desc": "Customize data processing rules", "custom_data_process_params_desc": "Customize data processing rules",
"custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.", "custom_split_sign_tip": "Allows you to chunk according to custom delimiters. \nUsually used for processed data, using specific separators for precise chunking. \nYou can use the | symbol to represent multiple splitters, such as: \".|.\" to represent a period in Chinese and English.\n\nTry to avoid using special symbols related to regular, such as: * () [] {}, etc.",
"data.ideal_chunk_length": "ideal block length",
"data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes", "data_amount": "{{dataAmount}} Datas, {{indexAmount}} Indexes",
"data_index_num": "Index {{index}}", "data_index_num": "Index {{index}}",
"data_process_params": "Params", "data_process_params": "Params",
@ -53,8 +52,6 @@
"file_model_function_tip": "Enhances indexing and QA generation", "file_model_function_tip": "Enhances indexing and QA generation",
"filename": "Filename", "filename": "Filename",
"folder_dataset": "Folder", "folder_dataset": "Folder",
"ideal_chunk_length": "ideal block length",
"ideal_chunk_length_tips": "Segment according to the end symbol and combine multiple segments into one block. This value determines the estimated size of the block, if there is any fluctuation.",
"image_auto_parse": "Automatic image indexing", "image_auto_parse": "Automatic image indexing",
"image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes", "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes",
"image_training_queue": "Queue of image processing", "image_training_queue": "Queue of image processing",
@ -68,6 +65,8 @@
"import_param_setting": "Parameter settings", "import_param_setting": "Parameter settings",
"import_select_file": "Select a file", "import_select_file": "Select a file",
"import_select_link": "Enter link", "import_select_link": "Enter link",
"index_size": "Index size",
"index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.",
"is_open_schedule": "Enable scheduled synchronization", "is_open_schedule": "Enable scheduled synchronization",
"keep_image": "Keep the picture", "keep_image": "Keep the picture",
"move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.", "move.hint": "After moving, the selected knowledge base/folder will inherit the permission settings of the new folder, and the original permission settings will become invalid.",
@ -89,6 +88,8 @@
"retain_collection": "Adjust Training Parameters", "retain_collection": "Adjust Training Parameters",
"retrain_task_submitted": "The retraining task has been submitted", "retrain_task_submitted": "The retraining task has been submitted",
"same_api_collection": "The same API set exists", "same_api_collection": "The same API set exists",
"split_chunk_char": "Block by specified splitter",
"split_chunk_size": "Block by length",
"split_sign_break": "1 newline character", "split_sign_break": "1 newline character",
"split_sign_break2": "2 newline characters", "split_sign_break2": "2 newline characters",
"split_sign_custom": "Customize", "split_sign_custom": "Customize",

View File

@ -573,7 +573,6 @@
"core.dataset.import.Custom process": "自定义规则", "core.dataset.import.Custom process": "自定义规则",
"core.dataset.import.Custom process desc": "自定义设置数据处理规则", "core.dataset.import.Custom process desc": "自定义设置数据处理规则",
"core.dataset.import.Custom prompt": "自定义提示词", "core.dataset.import.Custom prompt": "自定义提示词",
"core.dataset.import.Custom split char": "自定义分隔符",
"core.dataset.import.Custom text": "自定义文本", "core.dataset.import.Custom text": "自定义文本",
"core.dataset.import.Custom text desc": "手动输入一段文本作为数据集", "core.dataset.import.Custom text desc": "手动输入一段文本作为数据集",
"core.dataset.import.Data process params": "数据处理参数", "core.dataset.import.Data process params": "数据处理参数",

View File

@ -27,7 +27,6 @@
"custom_data_process_params": "自定义", "custom_data_process_params": "自定义",
"custom_data_process_params_desc": "自定义设置数据处理规则", "custom_data_process_params_desc": "自定义设置数据处理规则",
"custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号例如: * () [] {} 等。", "custom_split_sign_tip": "允许你根据自定义的分隔符进行分块。通常用于已处理好的数据,使用特定的分隔符来精确分块。可以使用 | 符号表示多个分割符,例如:“。|.” 表示中英文句号。\n尽量避免使用正则相关特殊符号例如: * () [] {} 等。",
"data.ideal_chunk_length": "理想分块长度",
"data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引", "data_amount": "{{dataAmount}} 组数据, {{indexAmount}} 组索引",
"data_index_num": "索引 {{index}}", "data_index_num": "索引 {{index}}",
"data_process_params": "处理参数", "data_process_params": "处理参数",
@ -53,8 +52,6 @@
"file_model_function_tip": "用于增强索引和 QA 生成", "file_model_function_tip": "用于增强索引和 QA 生成",
"filename": "文件名", "filename": "文件名",
"folder_dataset": "文件夹", "folder_dataset": "文件夹",
"ideal_chunk_length": "理想分块长度",
"ideal_chunk_length_tips": "按结束符号进行分段,并将多个分段组成一个分块,该值决定了分块的预估大小,如果会有上下浮动。",
"image_auto_parse": "图片自动索引", "image_auto_parse": "图片自动索引",
"image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引", "image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引",
"image_training_queue": "图片处理排队", "image_training_queue": "图片处理排队",
@ -68,6 +65,8 @@
"import_param_setting": "参数设置", "import_param_setting": "参数设置",
"import_select_file": "选择文件", "import_select_file": "选择文件",
"import_select_link": "输入链接", "import_select_link": "输入链接",
"index_size": "索引大小",
"index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。",
"is_open_schedule": "启用定时同步", "is_open_schedule": "启用定时同步",
"keep_image": "保留图片", "keep_image": "保留图片",
"move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。", "move.hint": "移动后,所选知识库/文件夹将继承新文件夹的权限设置,原先的权限设置失效。",
@ -89,6 +88,8 @@
"retain_collection": "调整训练参数", "retain_collection": "调整训练参数",
"retrain_task_submitted": "重新训练任务已提交", "retrain_task_submitted": "重新训练任务已提交",
"same_api_collection": "存在相同的 API 集合", "same_api_collection": "存在相同的 API 集合",
"split_chunk_char": "按指定分割符分块",
"split_chunk_size": "按长度分块",
"split_sign_break": "1 个换行符", "split_sign_break": "1 个换行符",
"split_sign_break2": "2 个换行符", "split_sign_break2": "2 个换行符",
"split_sign_custom": "自定义", "split_sign_custom": "自定义",

View File

@ -568,7 +568,6 @@
"core.dataset.import.Custom process": "自訂規則", "core.dataset.import.Custom process": "自訂規則",
"core.dataset.import.Custom process desc": "自訂設定資料處理規則", "core.dataset.import.Custom process desc": "自訂設定資料處理規則",
"core.dataset.import.Custom prompt": "自訂提示詞", "core.dataset.import.Custom prompt": "自訂提示詞",
"core.dataset.import.Custom split char": "自訂分隔符",
"core.dataset.import.Custom text": "自訂文字", "core.dataset.import.Custom text": "自訂文字",
"core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集", "core.dataset.import.Custom text desc": "手動輸入一段文字作為資料集",
"core.dataset.import.Data process params": "資料處理參數", "core.dataset.import.Data process params": "資料處理參數",

View File

@ -27,7 +27,6 @@
"custom_data_process_params": "自訂", "custom_data_process_params": "自訂",
"custom_data_process_params_desc": "自訂資料處理規則", "custom_data_process_params_desc": "自訂資料處理規則",
"custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號例如: * () [] {} 等。", "custom_split_sign_tip": "允許你根據自定義的分隔符進行分塊。\n通常用於已處理好的數據使用特定的分隔符來精確分塊。\n可以使用 | 符號表示多個分割符,例如:“。|.” 表示中英文句號。\n\n盡量避免使用正則相關特殊符號例如: * () [] {} 等。",
"data.ideal_chunk_length": "理想分塊長度",
"data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引", "data_amount": "{{dataAmount}} 組數據, {{indexAmount}} 組索引",
"data_index_num": "索引 {{index}}", "data_index_num": "索引 {{index}}",
"data_process_params": "處理參數", "data_process_params": "處理參數",
@ -53,8 +52,6 @@
"file_model_function_tip": "用於增強索引和問答生成", "file_model_function_tip": "用於增強索引和問答生成",
"filename": "檔案名稱", "filename": "檔案名稱",
"folder_dataset": "資料夾", "folder_dataset": "資料夾",
"ideal_chunk_length": "理想分塊長度",
"ideal_chunk_length_tips": "依結束符號進行分段,並將多個分段組成一個分塊,此值決定了分塊的預估大小,可能會有上下浮動。",
"image_auto_parse": "圖片自動索引", "image_auto_parse": "圖片自動索引",
"image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引", "image_auto_parse_tips": "調用 VLM 自動標註文檔裡的圖片,並生成額外的檢索索引",
"image_training_queue": "圖片處理排隊", "image_training_queue": "圖片處理排隊",
@ -68,6 +65,8 @@
"import_param_setting": "參數設置", "import_param_setting": "參數設置",
"import_select_file": "選擇文件", "import_select_file": "選擇文件",
"import_select_link": "輸入鏈接", "import_select_link": "輸入鏈接",
"index_size": "索引大小",
"index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。",
"is_open_schedule": "啟用定時同步", "is_open_schedule": "啟用定時同步",
"keep_image": "保留圖片", "keep_image": "保留圖片",
"move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。", "move.hint": "移動後,所選資料集/資料夾將繼承新資料夾的權限設定,原先的權限設定將失效。",
@ -89,6 +88,8 @@
"retain_collection": "調整訓練參數", "retain_collection": "調整訓練參數",
"retrain_task_submitted": "重新訓練任務已提交", "retrain_task_submitted": "重新訓練任務已提交",
"same_api_collection": "存在相同的 API 集合", "same_api_collection": "存在相同的 API 集合",
"split_chunk_char": "按指定分割符分塊",
"split_chunk_size": "按長度分塊",
"split_sign_break": "1 個換行符", "split_sign_break": "1 個換行符",
"split_sign_break2": "2 個換行符", "split_sign_break2": "2 個換行符",
"split_sign_custom": "自定義", "split_sign_custom": "自定義",

View File

@ -71,7 +71,7 @@ const EditResourceModal = ({
{...register('name', { required: true })} {...register('name', { required: true })}
bg={'myGray.50'} bg={'myGray.50'}
autoFocus autoFocus
maxLength={20} maxLength={100}
/> />
</HStack> </HStack>
</Box> </Box>

View File

@ -338,7 +338,7 @@ function EditKeyModal({
<FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel> <FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel>
<Input <Input
placeholder={t('publish:key_alias') || 'key_alias'} placeholder={t('publish:key_alias') || 'key_alias'}
maxLength={20} maxLength={100}
{...register('name', { {...register('name', {
required: t('common:common.name_is_empty') || 'name_is_empty' required: t('common:common.name_is_empty') || 'name_is_empty'
})} })}

View File

@ -117,7 +117,7 @@ function EditModal({
ml={4} ml={4}
autoFocus autoFocus
bg={'myWhite.600'} bg={'myWhite.600'}
maxLength={20} maxLength={100}
placeholder={t('user:team.Team Name')} placeholder={t('user:team.Team Name')}
{...register('name', { {...register('name', {
required: t('common:common.Please Input Name') required: t('common:common.Please Input Name')

View File

@ -326,7 +326,7 @@ function EditLinkModal({
<FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel> <FormLabel flex={'0 0 90px'}>{t('common:Name')}</FormLabel>
<Input <Input
placeholder={t('publish:link_name')} placeholder={t('publish:link_name')}
maxLength={20} maxLength={100}
{...register('name', { {...register('name', {
required: t('common:common.name_is_empty') required: t('common:common.name_is_empty')
})} })}

View File

@ -26,7 +26,7 @@ function BasicInfo({
</FormLabel> </FormLabel>
<Input <Input
placeholder={t('publish:publish_name')} placeholder={t('publish:publish_name')}
maxLength={20} maxLength={100}
{...register('name', { {...register('name', {
required: t('common:common.name_is_empty') required: t('common:common.name_is_empty')
})} })}

View File

@ -96,7 +96,7 @@ const ExtractFieldModal = ({
<Input <Input
bg={'myGray.50'} bg={'myGray.50'}
placeholder="name/age/sql" placeholder="name/age/sql"
maxLength={20} maxLength={100}
{...register('key', { required: true })} {...register('key', { required: true })}
/> />
</Flex> </Flex>

View File

@ -418,7 +418,7 @@ const NodeCard = (props: Props) => {
{RenderToolHandle} {RenderToolHandle}
<ConfirmSyncModal /> <ConfirmSyncModal />
<EditTitleModal maxLength={50} /> <EditTitleModal maxLength={100} />
</Flex> </Flex>
); );
}; };

View File

@ -319,7 +319,7 @@ const TemplateMarketModal = ({
onChange={(e) => setCurrentSearch(e.target.value)} onChange={(e) => setCurrentSearch(e.target.value)}
h={8} h={8}
bg={'myGray.50'} bg={'myGray.50'}
maxLength={20} maxLength={100}
borderRadius={'sm'} borderRadius={'sm'}
/> />
</Box> </Box>

View File

@ -49,7 +49,7 @@ const EditFolderModal = ({
defaultValue={name} defaultValue={name}
placeholder={t('common:dataset.Folder Name') || ''} placeholder={t('common:dataset.Folder Name') || ''}
autoFocus autoFocus
maxLength={20} maxLength={100}
/> />
</ModalBody> </ModalBody>
<ModalFooter> <ModalFooter>

View File

@ -10,11 +10,21 @@ import { useMyStep } from '@fastgpt/web/hooks/useStep';
import { Box, Button, Flex, IconButton } from '@chakra-ui/react'; import { Box, Button, Flex, IconButton } from '@chakra-ui/react';
import MyIcon from '@fastgpt/web/components/common/Icon'; import MyIcon from '@fastgpt/web/components/common/Icon';
import { TabEnum } from '../NavBar'; import { TabEnum } from '../NavBar';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { UseFormReturn, useForm } from 'react-hook-form'; import { UseFormReturn, useForm } from 'react-hook-form';
import { ImportSourceItemType } from '@/web/core/dataset/type'; import { ImportSourceItemType } from '@/web/core/dataset/type';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent'; import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import {
getMaxChunkSize,
getLLMDefaultChunkSize,
getLLMMaxChunkSize,
chunkAutoChunkSize,
minChunkSize,
getAutoIndexSize,
getMaxIndexSize
} from '@fastgpt/global/core/dataset/training/utils';
type TrainingFiledType = { type TrainingFiledType = {
chunkOverlapRatio: number; chunkOverlapRatio: number;
@ -22,6 +32,9 @@ type TrainingFiledType = {
minChunkSize: number; minChunkSize: number;
autoChunkSize: number; autoChunkSize: number;
chunkSize: number; chunkSize: number;
maxIndexSize?: number;
indexSize?: number;
autoIndexSize?: number;
charsPointsPrice: number; charsPointsPrice: number;
priceTip: string; priceTip: string;
uploadRate: number; uploadRate: number;
@ -47,9 +60,13 @@ export type ImportFormType = {
autoIndexes: boolean; autoIndexes: boolean;
chunkSettingMode: ChunkSettingModeEnum; chunkSettingMode: ChunkSettingModeEnum;
chunkSplitMode: DataChunkSplitModeEnum;
embeddingChunkSize: number; embeddingChunkSize: number;
qaChunkSize: number; qaChunkSize: number;
customSplitChar: string; chunkSplitter: string;
indexSize: number;
qaPrompt: string; qaPrompt: string;
webSelector: string; webSelector: string;
}; };
@ -199,9 +216,12 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
trainingType: DatasetCollectionDataProcessModeEnum.chunk, trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSettingMode: ChunkSettingModeEnum.auto, chunkSettingMode: ChunkSettingModeEnum.auto,
embeddingChunkSize: vectorModel?.defaultToken || 512,
qaChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), chunkSplitMode: DataChunkSplitModeEnum.size,
customSplitChar: '', embeddingChunkSize: 2000,
indexSize: vectorModel?.defaultToken || 512,
qaChunkSize: getLLMDefaultChunkSize(agentModel),
chunkSplitter: '',
qaPrompt: Prompt_AgentQA.description, qaPrompt: Prompt_AgentQA.description,
webSelector: '', webSelector: '',
customPdfParse: false customPdfParse: false
@ -215,17 +235,18 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
const chunkSettingMode = processParamsForm.watch('chunkSettingMode'); const chunkSettingMode = processParamsForm.watch('chunkSettingMode');
const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize'); const embeddingChunkSize = processParamsForm.watch('embeddingChunkSize');
const qaChunkSize = processParamsForm.watch('qaChunkSize'); const qaChunkSize = processParamsForm.watch('qaChunkSize');
const customSplitChar = processParamsForm.watch('customSplitChar'); const chunkSplitter = processParamsForm.watch('chunkSplitter');
const autoIndexes = processParamsForm.watch('autoIndexes'); const autoIndexes = processParamsForm.watch('autoIndexes');
const indexSize = processParamsForm.watch('indexSize');
const TrainingModeMap = useMemo<TrainingFiledType>(() => { const TrainingModeMap = useMemo<TrainingFiledType>(() => {
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
return { return {
chunkSizeField: 'qaChunkSize', chunkSizeField: 'qaChunkSize',
chunkOverlapRatio: 0, chunkOverlapRatio: 0,
maxChunkSize: Math.min(agentModel.maxResponse * 4, agentModel.maxContext * 0.7), maxChunkSize: getLLMMaxChunkSize(agentModel),
minChunkSize: 4000, minChunkSize: 1000,
autoChunkSize: Math.min(agentModel.maxResponse * 1, agentModel.maxContext * 0.7), autoChunkSize: getLLMDefaultChunkSize(agentModel),
chunkSize: qaChunkSize, chunkSize: qaChunkSize,
charsPointsPrice: agentModel.charsPointsPrice || 0, charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', { priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
@ -237,10 +258,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
return { return {
chunkSizeField: 'embeddingChunkSize', chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2, chunkOverlapRatio: 0.2,
maxChunkSize: 2048, maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: 100, minChunkSize: minChunkSize,
autoChunkSize: vectorModel?.defaultToken ? vectorModel.defaultToken * 2 : 1024, autoChunkSize: chunkAutoChunkSize,
chunkSize: embeddingChunkSize, chunkSize: embeddingChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel),
autoIndexSize: getAutoIndexSize(vectorModel),
indexSize,
charsPointsPrice: agentModel.charsPointsPrice || 0, charsPointsPrice: agentModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Auto mode Estimated Price Tips', { priceTip: t('dataset:import.Auto mode Estimated Price Tips', {
price: agentModel.charsPointsPrice price: agentModel.charsPointsPrice
@ -251,10 +275,13 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
return { return {
chunkSizeField: 'embeddingChunkSize', chunkSizeField: 'embeddingChunkSize',
chunkOverlapRatio: 0.2, chunkOverlapRatio: 0.2,
maxChunkSize: vectorModel?.maxToken || 512, maxChunkSize: getMaxChunkSize(agentModel),
minChunkSize: 100, minChunkSize: minChunkSize,
autoChunkSize: vectorModel?.defaultToken || 512, autoChunkSize: chunkAutoChunkSize,
chunkSize: embeddingChunkSize, chunkSize: embeddingChunkSize,
maxIndexSize: getMaxIndexSize(vectorModel),
autoIndexSize: getAutoIndexSize(vectorModel),
indexSize,
charsPointsPrice: vectorModel.charsPointsPrice || 0, charsPointsPrice: vectorModel.charsPointsPrice || 0,
priceTip: t('dataset:import.Embedding Estimated Price Tips', { priceTip: t('dataset:import.Embedding Estimated Price Tips', {
price: vectorModel.charsPointsPrice price: vectorModel.charsPointsPrice
@ -265,30 +292,36 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode
}, [ }, [
trainingType, trainingType,
autoIndexes, autoIndexes,
agentModel.maxResponse, agentModel,
agentModel.maxContext,
agentModel.charsPointsPrice,
qaChunkSize, qaChunkSize,
t, t,
vectorModel.defaultToken, embeddingChunkSize,
vectorModel?.maxToken, vectorModel,
vectorModel.charsPointsPrice, indexSize
embeddingChunkSize
]); ]);
const chunkSettingModeMap = useMemo(() => { const chunkSettingModeMap = useMemo(() => {
if (chunkSettingMode === ChunkSettingModeEnum.auto) { if (chunkSettingMode === ChunkSettingModeEnum.auto) {
return { return {
chunkSize: TrainingModeMap.autoChunkSize, chunkSize: TrainingModeMap.autoChunkSize,
customSplitChar: '' indexSize: TrainingModeMap.autoIndexSize,
chunkSplitter: ''
}; };
} else { } else {
return { return {
chunkSize: TrainingModeMap.chunkSize, chunkSize: TrainingModeMap.chunkSize,
customSplitChar indexSize: TrainingModeMap.indexSize,
chunkSplitter
}; };
} }
}, [chunkSettingMode, TrainingModeMap.autoChunkSize, TrainingModeMap.chunkSize, customSplitChar]); }, [
chunkSettingMode,
TrainingModeMap.autoChunkSize,
TrainingModeMap.autoIndexSize,
TrainingModeMap.chunkSize,
TrainingModeMap.indexSize,
chunkSplitter
]);
const contextValue = { const contextValue = {
...TrainingModeMap, ...TrainingModeMap,

View File

@ -20,10 +20,11 @@ import MyIcon from '@fastgpt/web/components/common/Icon';
import { useTranslation } from 'next-i18next'; import { useTranslation } from 'next-i18next';
import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio'; import LeftRadio from '@fastgpt/web/components/common/Radio/LeftRadio';
import { import {
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum, DatasetCollectionDataProcessModeEnum,
DatasetCollectionDataProcessModeMap DatasetCollectionDataProcessModeMap
} from '@fastgpt/global/core/dataset/constants'; } from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip';
import { useSystemStore } from '@/web/common/system/useSystemStore'; import { useSystemStore } from '@/web/common/system/useSystemStore';
import MyModal from '@fastgpt/web/components/common/MyModal'; import MyModal from '@fastgpt/web/components/common/MyModal';
@ -37,25 +38,39 @@ import QuestionTip from '@fastgpt/web/components/common/MyTooltip/QuestionTip';
import { shadowLight } from '@fastgpt/web/styles/theme'; import { shadowLight } from '@fastgpt/web/styles/theme';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import MySelect from '@fastgpt/web/components/common/MySelect'; import MySelect from '@fastgpt/web/components/common/MySelect';
import { getIndexSizeSelectList } from '@fastgpt/global/core/dataset/training/utils';
import RadioGroup from '@fastgpt/web/components/common/Radio/RadioGroup';
function DataProcess() { function DataProcess() {
const { t } = useTranslation(); const { t } = useTranslation();
const { feConfigs } = useSystemStore(); const { feConfigs } = useSystemStore();
const { goToNext, processParamsForm, chunkSizeField, minChunkSize, maxChunkSize } = const {
useContextSelector(DatasetImportContext, (v) => v); goToNext,
processParamsForm,
chunkSizeField,
minChunkSize,
maxChunkSize,
maxIndexSize,
indexSize
} = useContextSelector(DatasetImportContext, (v) => v);
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const { setValue, register, watch, getValues } = processParamsForm; const { setValue, register, watch, getValues } = processParamsForm;
const trainingType = watch('trainingType'); const trainingType = watch('trainingType');
const chunkSettingMode = watch('chunkSettingMode'); const trainingModeList = useMemo(() => {
const list = Object.entries(DatasetCollectionDataProcessModeMap);
return list
.filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto)
.map(([key, value]) => ({
title: t(value.label as any),
value: key as DatasetCollectionDataProcessModeEnum,
tooltip: t(value.tooltip as any)
}));
}, [t]);
const qaPrompt = watch('qaPrompt'); const chunkSettingMode = watch('chunkSettingMode');
const { const chunkSplitMode = watch('chunkSplitMode');
isOpen: isOpenCustomPrompt,
onOpen: onOpenCustomPrompt,
onClose: onCloseCustomPrompt
} = useDisclosure();
const customSplitList = [ const customSplitList = [
{ label: t('dataset:split_sign_null'), value: '' }, { label: t('dataset:split_sign_null'), value: '' },
@ -69,25 +84,25 @@ function DataProcess() {
{ label: t('dataset:split_sign_custom'), value: 'Other' } { label: t('dataset:split_sign_custom'), value: 'Other' }
]; ];
const [customListSelectValue, setCustomListSelectValue] = useState(getValues('customSplitChar')); const [customListSelectValue, setCustomListSelectValue] = useState(getValues('chunkSplitter'));
useEffect(() => { useEffect(() => {
if (customListSelectValue === 'Other') { if (customListSelectValue === 'Other') {
setValue('customSplitChar', ''); setValue('chunkSplitter', '');
} else { } else {
setValue('customSplitChar', customListSelectValue); setValue('chunkSplitter', customListSelectValue);
} }
}, [customListSelectValue, setValue]); }, [customListSelectValue, setValue]);
const trainingModeList = useMemo(() => { // Index size
const list = Object.entries(DatasetCollectionDataProcessModeMap); const indexSizeSelectorList = useMemo(() => getIndexSizeSelectList(maxIndexSize), [maxIndexSize]);
return list
.filter(([key]) => key !== DatasetCollectionDataProcessModeEnum.auto) // QA
.map(([key, value]) => ({ const qaPrompt = watch('qaPrompt');
title: t(value.label as any), const {
value: key as DatasetCollectionDataProcessModeEnum, isOpen: isOpenCustomPrompt,
tooltip: t(value.tooltip as any) onOpen: onOpenCustomPrompt,
})); onClose: onCloseCustomPrompt
}, [t]); } = useDisclosure();
const Title = useCallback(({ title }: { title: string }) => { const Title = useCallback(({ title }: { title: string }) => {
return ( return (
@ -237,67 +252,97 @@ function DataProcess() {
children: chunkSettingMode === ChunkSettingModeEnum.custom && ( children: chunkSettingMode === ChunkSettingModeEnum.custom && (
<Box mt={5}> <Box mt={5}>
<Box> <Box>
<Flex alignItems={'center'}> <RadioGroup<DataChunkSplitModeEnum>
<Box>{t('dataset:ideal_chunk_length')}</Box> list={[
<QuestionTip label={t('dataset:ideal_chunk_length_tips')} /> {
</Flex> title: t('dataset:split_chunk_size'),
<Box value: DataChunkSplitModeEnum.size
mt={1} },
css={{ {
'& > span': { title: t('dataset:split_chunk_char'),
display: 'block' value: DataChunkSplitModeEnum.char,
tooltip: t('dataset:custom_split_sign_tip')
} }
]}
value={chunkSplitMode}
onChange={(e) => {
setValue('chunkSplitMode', e);
}} }}
> />
<MyTooltip
label={t('common:core.dataset.import.Chunk Range', { {chunkSplitMode === DataChunkSplitModeEnum.size && (
min: minChunkSize, <Box
max: maxChunkSize mt={1.5}
})} css={{
'& > span': {
display: 'block'
}
}}
> >
<MyNumberInput <MyTooltip
register={register} label={t('common:core.dataset.import.Chunk Range', {
name={chunkSizeField} min: minChunkSize,
min={minChunkSize} max: maxChunkSize
max={maxChunkSize} })}
size={'sm'} >
step={100} <MyNumberInput
/> register={register}
</MyTooltip> name={chunkSizeField}
</Box> min={minChunkSize}
max={maxChunkSize}
size={'sm'}
step={100}
/>
</MyTooltip>
</Box>
)}
{chunkSplitMode === DataChunkSplitModeEnum.char && (
<HStack mt={1.5}>
<Box flex={'1 0 0'}>
<MySelect<string>
list={customSplitList}
size={'sm'}
bg={'myGray.50'}
value={customListSelectValue}
h={'32px'}
onChange={(val) => {
setCustomListSelectValue(val);
}}
/>
</Box>
{customListSelectValue === 'Other' && (
<Input
flex={'1 0 0'}
h={'32px'}
size={'sm'}
bg={'myGray.50'}
placeholder="\n;======;==SPLIT=="
{...register('chunkSplitter')}
/>
)}
</HStack>
)}
</Box> </Box>
<Box mt={3}> {trainingType === DatasetCollectionDataProcessModeEnum.chunk && (
<Box> <Box>
{t('common:core.dataset.import.Custom split char')} <Flex alignItems={'center'} mt={3}>
<QuestionTip label={t('dataset:custom_split_sign_tip')} /> <Box>{t('dataset:index_size')}</Box>
</Box> <QuestionTip label={t('dataset:index_size_tips')} />
</Flex>
<HStack mt={1}> <Box mt={1}>
<Box flex={'1 0 0'}> <MySelect<number>
<MySelect<string>
list={customSplitList}
size={'sm'}
bg={'myGray.50'} bg={'myGray.50'}
value={customListSelectValue} list={indexSizeSelectorList}
h={'32px'} value={indexSize}
onChange={(val) => { onChange={(val) => {
setCustomListSelectValue(val); setValue('indexSize', val);
}} }}
/> />
</Box> </Box>
{customListSelectValue === 'Other' && ( </Box>
<Input )}
flex={'1 0 0'}
h={'32px'}
size={'sm'}
bg={'myGray.50'}
placeholder="\n;======;==SPLIT=="
{...register('customSplitChar')}
/>
)}
</HStack>
</Box>
{showQAPromptInput && ( {showQAPromptInput && (
<Box mt={3}> <Box mt={3}>
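The index-size dropdown above is populated by `getIndexSizeSelectList(maxIndexSize)` from `@fastgpt/global/core/dataset/training/utils`. Its body is not shown in this diff; a plausible reading, sketched here as an assumption, is that it returns the preset sizes that fit within the embedding model's maximum, in the `{ label, value }` shape that `MySelect` expects.

```ts
// Hypothetical sketch — the preset values are assumptions, not the committed ones.
export const getIndexSizeSelectList = (maxIndexSize = 512) => {
  const presets = [128, 256, 512, 768, 1024, 1536, 2048, 3072, 4096];
  return presets
    .filter((size) => size <= maxIndexSize)
    .map((size) => ({ label: String(size), value: size }));
};
```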
View File
@ -16,6 +16,7 @@ import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContex
import MyBox from '@fastgpt/web/components/common/MyBox'; import MyBox from '@fastgpt/web/components/common/MyBox';
import Markdown from '@/components/Markdown'; import Markdown from '@/components/Markdown';
import { useToast } from '@fastgpt/web/hooks/useToast'; import { useToast } from '@fastgpt/web/hooks/useToast';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const PreviewData = () => { const PreviewData = () => {
const { t } = useTranslation(); const { t } = useTranslation();
@ -23,6 +24,7 @@ const PreviewData = () => {
const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext); const goToNext = useContextSelector(DatasetImportContext, (v) => v.goToNext);
const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const sources = useContextSelector(DatasetImportContext, (v) => v.sources); const sources = useContextSelector(DatasetImportContext, (v) => v.sources);
const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource); const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource);
@ -36,12 +38,13 @@ const PreviewData = () => {
async () => { async () => {
if (!previewFile) return; if (!previewFile) return;
if (importSource === ImportDataSourceEnum.fileCustom) { if (importSource === ImportDataSourceEnum.fileCustom) {
const customSplitChar = processParamsForm.getValues('customSplitChar'); const chunkSplitter = processParamsForm.getValues('chunkSplitter');
const { chunks } = splitText2Chunks({ const { chunks } = splitText2Chunks({
text: previewFile.rawText || '', text: previewFile.rawText || '',
chunkLen: chunkSize, chunkSize,
maxSize: getLLMMaxChunkSize(datasetDetail.agentModel),
overlapRatio: chunkOverlapRatio, overlapRatio: chunkOverlapRatio,
customReg: customSplitChar ? [customSplitChar] : [] customReg: chunkSplitter ? [chunkSplitter] : []
}); });
return chunks.map((chunk) => ({ return chunks.map((chunk) => ({
q: chunk, q: chunk,
@ -61,9 +64,12 @@ const PreviewData = () => {
customPdfParse: processParamsForm.getValues('customPdfParse'), customPdfParse: processParamsForm.getValues('customPdfParse'),
trainingType: processParamsForm.getValues('trainingType'),
chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
chunkSize, chunkSize,
chunkSplitter: processParamsForm.getValues('chunkSplitter'),
overlapRatio: chunkOverlapRatio, overlapRatio: chunkOverlapRatio,
customSplitChar: processParamsForm.getValues('customSplitChar'),
selector: processParamsForm.getValues('webSelector'), selector: processParamsForm.getValues('webSelector'),
isQAImport: importSource === ImportDataSourceEnum.csvTable, isQAImport: importSource === ImportDataSourceEnum.csvTable,
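Note that `splitText2Chunks` now takes `chunkSize` (formerly `chunkLen`) plus a `maxSize` cap derived from the dataset's agent model. A minimal usage sketch of the call above, with a made-up sample text and `datasetDetail` assumed to come from `DatasetPageContext` as in this component:

```ts
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';

const rawText = 'First paragraph...\n\nSecond paragraph...'; // hypothetical sample text
const chunkSplitter = ''; // no custom top-priority splitter in this example

const { chunks } = splitText2Chunks({
  text: rawText,
  chunkSize: 1500, // target chunk length
  maxSize: getLLMMaxChunkSize(datasetDetail.agentModel), // datasetDetail assumed from DatasetPageContext
  overlapRatio: 0.2, // overlap between adjacent chunks
  customReg: chunkSplitter ? [chunkSplitter] : []
});
```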
View File
@ -49,7 +49,7 @@ const Upload = () => {
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const retrainNewCollectionId = useRef(''); const retrainNewCollectionId = useRef('');
const { importSource, parentId, sources, setSources, processParamsForm, chunkSize } = const { importSource, parentId, sources, setSources, processParamsForm, chunkSize, indexSize } =
useContextSelector(DatasetImportContext, (v) => v); useContextSelector(DatasetImportContext, (v) => v);
const { handleSubmit } = processParamsForm; const { handleSubmit } = processParamsForm;
@ -81,7 +81,7 @@ const Upload = () => {
}, [waitingFilesCount, totalFilesCount, allFinished, t]); }, [waitingFilesCount, totalFilesCount, allFinished, t]);
const { runAsync: startUpload, loading: isLoading } = useRequest2( const { runAsync: startUpload, loading: isLoading } = useRequest2(
async ({ trainingType, customSplitChar, qaPrompt, webSelector }: ImportFormType) => { async ({ trainingType, chunkSplitter, qaPrompt, webSelector }: ImportFormType) => {
if (sources.length === 0) return; if (sources.length === 0) return;
const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting'); const filterWaitingSources = sources.filter((item) => item.createStatus === 'waiting');
@ -111,10 +111,16 @@ const Upload = () => {
trainingType, trainingType,
imageIndex: processParamsForm.getValues('imageIndex'), imageIndex: processParamsForm.getValues('imageIndex'),
autoIndexes: processParamsForm.getValues('autoIndexes'), autoIndexes: processParamsForm.getValues('autoIndexes'),
chunkSettingMode: processParamsForm.getValues('chunkSettingMode'),
chunkSplitMode: processParamsForm.getValues('chunkSplitMode'),
chunkSize, chunkSize,
chunkSplitter: customSplitChar, indexSize,
chunkSplitter,
qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined qaPrompt: trainingType === DatasetCollectionDataProcessModeEnum.qa ? qaPrompt : undefined
}; };
if (importSource === ImportDataSourceEnum.reTraining) { if (importSource === ImportDataSourceEnum.reTraining) {
const res = await postReTrainingDatasetFileCollection({ const res = await postReTrainingDatasetFileCollection({
...commonParams, ...commonParams,
View File
@ -1,102 +0,0 @@
import React from 'react';
import { Box } from '@chakra-ui/react';
import { ImportSourceItemType } from '@/web/core/dataset/type';
import MyRightDrawer from '@fastgpt/web/components/common/MyDrawer/MyRightDrawer';
import { getPreviewChunks } from '@/web/core/dataset/api';
import { ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { useContextSelector } from 'use-context-selector';
import { DatasetImportContext } from '../Context';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { getPreviewSourceReadType } from '../utils';
const PreviewChunks = ({
previewSource,
onClose
}: {
previewSource: ImportSourceItemType;
onClose: () => void;
}) => {
const { importSource, chunkSize, chunkOverlapRatio, processParamsForm } = useContextSelector(
DatasetImportContext,
(v) => v
);
const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId);
const { data = [], loading: isLoading } = useRequest2(
async () => {
if (importSource === ImportDataSourceEnum.fileCustom) {
const customSplitChar = processParamsForm.getValues('customSplitChar');
const { chunks } = splitText2Chunks({
text: previewSource.rawText || '',
chunkLen: chunkSize,
overlapRatio: chunkOverlapRatio,
customReg: customSplitChar ? [customSplitChar] : []
});
return chunks.map((chunk) => ({
q: chunk,
a: ''
}));
}
return getPreviewChunks({
datasetId,
type: getPreviewSourceReadType(previewSource),
sourceId:
previewSource.dbFileId ||
previewSource.link ||
previewSource.externalFileUrl ||
previewSource.apiFileId ||
'',
chunkSize,
overlapRatio: chunkOverlapRatio,
customSplitChar: processParamsForm.getValues('customSplitChar'),
selector: processParamsForm.getValues('webSelector'),
isQAImport: importSource === ImportDataSourceEnum.csvTable,
externalFileId: previewSource.externalFileId
});
},
{
manual: false
}
);
return (
<MyRightDrawer
onClose={onClose}
iconSrc={previewSource.icon}
title={previewSource.sourceName}
isLoading={isLoading}
maxW={['90vw', '40vw']}
px={0}
>
<Box overflowY={'auto'} px={5} fontSize={'sm'}>
{data.map((item, index) => (
<Box
key={index}
whiteSpace={'pre-wrap'}
fontSize={'sm'}
p={4}
bg={index % 2 === 0 ? 'white' : 'myWhite.600'}
mb={3}
borderRadius={'md'}
borderWidth={'1px'}
borderColor={'borderColor.low'}
boxShadow={'2'}
_notLast={{
mb: 2
}}
>
<Box color={'myGray.900'}>{item.q}</Box>
<Box color={'myGray.500'}>{item.a}</Box>
</Box>
))}
</Box>
</MyRightDrawer>
);
};
export default React.memo(PreviewChunks);
View File
@ -8,10 +8,11 @@ import { useRouter } from 'next/router';
import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest';
import { getDatasetCollectionById } from '@/web/core/dataset/api'; import { getDatasetCollectionById } from '@/web/core/dataset/api';
import MyBox from '@fastgpt/web/components/common/MyBox'; import MyBox from '@fastgpt/web/components/common/MyBox';
import { ChunkSettingModeEnum } from '@/web/core/dataset/constants'; import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils'; import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils';
import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext';
import { Box } from '@chakra-ui/react'; import { Box } from '@chakra-ui/react';
import { DataChunkSplitModeEnum } from '@fastgpt/global/core/dataset/constants';
import { Prompt_AgentQA } from '@fastgpt/global/core/ai/prompt/agent';
const Upload = dynamic(() => import('../commonProgress/Upload')); const Upload = dynamic(() => import('../commonProgress/Upload'));
const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); const PreviewData = dynamic(() => import('../commonProgress/PreviewData'));
@ -23,7 +24,6 @@ const ReTraining = () => {
collectionId: string; collectionId: string;
}; };
const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail);
const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep); const activeStep = useContextSelector(DatasetImportContext, (v) => v.activeStep);
const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources); const setSources = useContextSelector(DatasetImportContext, (v) => v.setSources);
const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm); const processParamsForm = useContextSelector(DatasetImportContext, (v) => v.processParamsForm);
@ -46,18 +46,21 @@ const ReTraining = () => {
uploadedFileRate: 100 uploadedFileRate: 100
} }
]); ]);
processParamsForm.reset({ processParamsForm.reset({
customPdfParse: collection.customPdfParse, customPdfParse: collection.customPdfParse,
trainingType: collection.trainingType, trainingType: collection.trainingType,
imageIndex: collection.imageIndex, imageIndex: collection.imageIndex,
autoIndexes: collection.autoIndexes, autoIndexes: collection.autoIndexes,
chunkSettingMode: ChunkSettingModeEnum.auto, chunkSettingMode: collection.chunkSettingMode || ChunkSettingModeEnum.auto,
chunkSplitMode: collection.chunkSplitMode || DataChunkSplitModeEnum.size,
embeddingChunkSize: collection.chunkSize, embeddingChunkSize: collection.chunkSize,
qaChunkSize: collection.chunkSize, qaChunkSize: collection.chunkSize,
customSplitChar: collection.chunkSplitter, indexSize: collection.indexSize || 512,
qaPrompt: collection.qaPrompt, chunkSplitter: collection.chunkSplitter,
webSelector: collection.metadata?.webPageSelector webSelector: collection.metadata?.webPageSelector,
qaPrompt: collection.qaPrompt || Prompt_AgentQA.description
}); });
} }
}); });
View File
@ -294,7 +294,7 @@ const MyInfo = ({ onOpenContact }: { onOpenContact: () => void }) => {
title={t('account_info:click_modify_nickname')} title={t('account_info:click_modify_nickname')}
borderColor={'transparent'} borderColor={'transparent'}
transform={'translateX(-11px)'} transform={'translateX(-11px)'}
maxLength={20} maxLength={100}
onBlur={async (e) => { onBlur={async (e) => {
const val = e.target.value; const val = e.target.value;
if (val === userInfo?.team?.memberName) return; if (val === userInfo?.team?.memberName) return;
View File
@ -2,8 +2,7 @@ import { reTrainingDatasetFileCollectionParams } from '@fastgpt/global/core/data
import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller'; import { createCollectionAndInsertData } from '@fastgpt/service/core/dataset/collection/controller';
import { import {
DatasetCollectionTypeEnum, DatasetCollectionTypeEnum,
DatasetSourceReadTypeEnum, DatasetSourceReadTypeEnum
TrainingModeEnum
} from '@fastgpt/global/core/dataset/constants'; } from '@fastgpt/global/core/dataset/constants';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { hashStr } from '@fastgpt/global/common/string/tools'; import { hashStr } from '@fastgpt/global/common/string/tools';
View File
@ -4,7 +4,7 @@
*/ */
import type { NextApiRequest } from 'next'; import type { NextApiRequest } from 'next';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index'; import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken/index';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model'; import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { hasSameValue } from '@/service/core/dataset/data/utils'; import { hasSameValue } from '@/service/core/dataset/data/utils';
import { insertData2Dataset } from '@/service/core/dataset/data/controller'; import { insertData2Dataset } from '@/service/core/dataset/data/controller';
import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth'; import { authDatasetCollection } from '@fastgpt/service/support/permission/dataset/auth';
@ -16,6 +16,7 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/teamLimit
import { NextAPI } from '@/service/middleware/entry'; import { NextAPI } from '@/service/middleware/entry';
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant'; import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
async function handler(req: NextApiRequest) { async function handler(req: NextApiRequest) {
const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps; const { collectionId, q, a, indexes } = req.body as InsertOneDatasetDataProps;
@ -45,7 +46,7 @@ async function handler(req: NextApiRequest) {
// auth collection and get dataset // auth collection and get dataset
const [ const [
{ {
dataset: { _id: datasetId, vectorModel } dataset: { _id: datasetId, vectorModel, agentModel }
} }
] = await Promise.all([getCollectionWithDataset(collectionId)]); ] = await Promise.all([getCollectionWithDataset(collectionId)]);
@ -60,9 +61,11 @@ async function handler(req: NextApiRequest) {
// token check // token check
const token = await countPromptTokens(formatQ + formatA, ''); const token = await countPromptTokens(formatQ + formatA, '');
const vectorModelData = getEmbeddingModel(vectorModel); const vectorModelData = getEmbeddingModel(vectorModel);
const llmModelData = getLLMModel(agentModel);
const maxChunkSize = getLLMMaxChunkSize(llmModelData);
if (token > vectorModelData.maxToken) { if (token > maxChunkSize) {
return Promise.reject('Q Over Tokens'); return Promise.reject(`Content over max chunk size: ${maxChunkSize}`);
} }
// Duplicate data check // Duplicate data check
@ -82,7 +85,7 @@ async function handler(req: NextApiRequest) {
q: formatQ, q: formatQ,
a: formatA, a: formatA,
chunkIndex: 0, chunkIndex: 0,
model: vectorModelData.model, embeddingModel: vectorModelData.model,
indexes: formatIndexes indexes: formatIndexes
}); });
View File
@ -1,4 +1,9 @@
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants'; import {
ChunkSettingModeEnum,
DataChunkSplitModeEnum,
DatasetCollectionDataProcessModeEnum,
DatasetSourceReadTypeEnum
} from '@fastgpt/global/core/dataset/constants';
import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read'; import { rawText2Chunks, readDatasetSourceRawText } from '@fastgpt/service/core/dataset/read';
import { NextAPI } from '@/service/middleware/entry'; import { NextAPI } from '@/service/middleware/entry';
import { ApiRequestProps } from '@fastgpt/service/type/next'; import { ApiRequestProps } from '@fastgpt/service/type/next';
@ -8,17 +13,30 @@ import {
} from '@fastgpt/global/support/permission/constant'; } from '@fastgpt/global/support/permission/constant';
import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file'; import { authCollectionFile } from '@fastgpt/service/support/permission/auth/file';
import { authDataset } from '@fastgpt/service/support/permission/dataset/auth'; import { authDataset } from '@fastgpt/service/support/permission/dataset/auth';
import {
computeChunkSize,
computeChunkSplitter,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
import { CommonErrEnum } from '@fastgpt/global/common/error/code/common';
import { getLLMModel } from '@fastgpt/service/core/ai/model';
export type PostPreviewFilesChunksProps = { export type PostPreviewFilesChunksProps = {
datasetId: string; datasetId: string;
type: DatasetSourceReadTypeEnum; type: DatasetSourceReadTypeEnum;
sourceId: string; sourceId: string;
chunkSize: number;
overlapRatio: number;
customSplitChar?: string;
customPdfParse?: boolean; customPdfParse?: boolean;
trainingType: DatasetCollectionDataProcessModeEnum;
// Chunk settings
chunkSettingMode: ChunkSettingModeEnum;
chunkSplitMode: DataChunkSplitModeEnum;
chunkSize: number;
chunkSplitter?: string;
overlapRatio: number;
// Read params // Read params
selector?: string; selector?: string;
isQAImport?: boolean; isQAImport?: boolean;
@ -32,55 +50,64 @@ export type PreviewChunksResponse = {
async function handler( async function handler(
req: ApiRequestProps<PostPreviewFilesChunksProps> req: ApiRequestProps<PostPreviewFilesChunksProps>
): Promise<PreviewChunksResponse> { ): Promise<PreviewChunksResponse> {
const { let {
type, type,
sourceId, sourceId,
customPdfParse = false,
trainingType,
chunkSettingMode,
chunkSplitMode,
chunkSize, chunkSize,
customSplitChar, chunkSplitter,
overlapRatio, overlapRatio,
selector, selector,
isQAImport, isQAImport,
datasetId, datasetId,
externalFileId, externalFileId
customPdfParse = false
} = req.body; } = req.body;
if (!sourceId) { if (!sourceId) {
throw new Error('sourceId is empty'); throw new Error('sourceId is empty');
} }
if (chunkSize > 30000) {
throw new Error('chunkSize is too large, should be less than 30000'); const fileAuthRes =
type === DatasetSourceReadTypeEnum.fileLocal
? await authCollectionFile({
req,
authToken: true,
authApiKey: true,
fileId: sourceId,
per: OwnerPermissionVal
})
: undefined;
const { dataset, teamId, tmbId } = await authDataset({
req,
authApiKey: true,
authToken: true,
datasetId,
per: WritePermissionVal
});
if (fileAuthRes && (String(fileAuthRes.tmbId) !== String(tmbId) || !fileAuthRes.isRoot)) {
return Promise.reject(CommonErrEnum.unAuthFile);
} }
const { teamId, tmbId, apiServer, feishuServer, yuqueServer } = await (async () => { chunkSize = computeChunkSize({
if (type === DatasetSourceReadTypeEnum.fileLocal) { trainingType,
const res = await authCollectionFile({ chunkSettingMode,
req, chunkSplitMode,
authToken: true, chunkSize,
authApiKey: true, llmModel: getLLMModel(dataset.agentModel)
fileId: sourceId, });
per: OwnerPermissionVal
}); chunkSplitter = computeChunkSplitter({
return { chunkSettingMode,
teamId: res.teamId, chunkSplitMode,
tmbId: res.tmbId chunkSplitter
}; });
}
const { dataset, teamId, tmbId } = await authDataset({
req,
authApiKey: true,
authToken: true,
datasetId,
per: WritePermissionVal
});
return {
teamId,
tmbId,
apiServer: dataset.apiServer,
feishuServer: dataset.feishuServer,
yuqueServer: dataset.yuqueServer
};
})();
const { rawText } = await readDatasetSourceRawText({ const { rawText } = await readDatasetSourceRawText({
teamId, teamId,
@ -89,18 +116,19 @@ async function handler(
sourceId, sourceId,
selector, selector,
isQAImport, isQAImport,
apiServer, apiServer: dataset.apiServer,
feishuServer, feishuServer: dataset.feishuServer,
yuqueServer, yuqueServer: dataset.yuqueServer,
externalFileId, externalFileId,
customPdfParse customPdfParse
}); });
return rawText2Chunks({ return rawText2Chunks({
rawText, rawText,
chunkLen: chunkSize, chunkSize,
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
overlapRatio, overlapRatio,
customReg: customSplitChar ? [customSplitChar] : [], customReg: chunkSplitter ? [chunkSplitter] : [],
isQAImport: isQAImport isQAImport: isQAImport
}).slice(0, 10); }).slice(0, 10);
} }
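The rewritten handler delegates parameter resolution to `computeChunkSize` and `computeChunkSplitter` from `@fastgpt/global/core/dataset/training/utils`. Those implementations are outside this excerpt; the sketch below only captures the behaviour implied by the call sites (auto mode falls back to system defaults, custom sizes are clamped to the LLM limit, a splitter only applies in character-split mode) and is an assumption, not the committed code. The `trainingType` argument the handler also passes is omitted here.

```ts
// Hypothetical sketch of the resolution helpers; the committed logic may differ.
import type { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import {
  ChunkSettingModeEnum,
  DataChunkSplitModeEnum
} from '@fastgpt/global/core/dataset/constants';
import {
  chunkAutoChunkSize,
  getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';

export const computeChunkSize = ({
  chunkSettingMode,
  chunkSplitMode,
  chunkSize,
  llmModel
}: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  chunkSize?: number;
  llmModel: LLMModelItemType;
}) => {
  if (chunkSettingMode === ChunkSettingModeEnum.auto) return chunkAutoChunkSize; // system default
  if (chunkSplitMode === DataChunkSplitModeEnum.char) return getLLMMaxChunkSize(llmModel); // size is irrelevant when splitting by character
  return Math.min(chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(llmModel)); // clamp custom sizes
};

export const computeChunkSplitter = ({
  chunkSettingMode,
  chunkSplitMode,
  chunkSplitter
}: {
  chunkSettingMode?: ChunkSettingModeEnum;
  chunkSplitMode?: DataChunkSplitModeEnum;
  chunkSplitter?: string;
}) => {
  if (chunkSettingMode === ChunkSettingModeEnum.auto) return undefined; // auto mode ignores custom splitters
  if (chunkSplitMode !== DataChunkSplitModeEnum.char) return undefined; // splitter only applies in char mode
  return chunkSplitter;
};
```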
View File
@ -5,25 +5,63 @@ import {
UpdateDatasetDataProps UpdateDatasetDataProps
} from '@fastgpt/global/core/dataset/controller'; } from '@fastgpt/global/core/dataset/controller';
import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller'; import { insertDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils';
import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index'; import { jiebaSplit } from '@fastgpt/service/common/string/jieba/index';
import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller'; import { deleteDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller';
import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type'; import { DatasetDataIndexItemType, DatasetDataItemType } from '@fastgpt/global/core/dataset/type';
import { getEmbeddingModel } from '@fastgpt/service/core/ai/model'; import { getEmbeddingModel, getLLMModel } from '@fastgpt/service/core/ai/model';
import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun';
import { ClientSession } from '@fastgpt/service/common/mongo'; import { ClientSession } from '@fastgpt/service/common/mongo';
import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema';
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
import { countPromptTokens } from '@fastgpt/service/common/string/tiktoken';
import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils';
const formatIndexes = ({ const formatIndexes = async ({
indexes, indexes,
q, q,
a = '' a = '',
indexSize
}: { }: {
indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[]; indexes?: (Omit<DatasetDataIndexItemType, 'dataId'> & { dataId?: string })[];
q: string; q: string;
a?: string; a?: string;
}) => { indexSize: number;
}): Promise<
{
type: `${DatasetDataIndexTypeEnum}`;
text: string;
dataId?: string;
}[]
> => {
/* get dataset data default index */
const getDefaultIndex = ({
q = '',
a,
indexSize
}: {
q?: string;
a?: string;
indexSize: number;
}) => {
const qChunks = splitText2Chunks({
text: q,
chunkSize: indexSize
}).chunks;
const aChunks = a ? splitText2Chunks({ text: a, chunkSize: indexSize }).chunks : [];
return [
...qChunks.map((text) => ({
text,
type: DatasetDataIndexTypeEnum.default
})),
...aChunks.map((text) => ({
text,
type: DatasetDataIndexTypeEnum.default
}))
];
};
indexes = indexes || []; indexes = indexes || [];
// If index not type, set it to custom // If index not type, set it to custom
indexes = indexes indexes = indexes
@ -35,7 +73,7 @@ const formatIndexes = ({
.filter((item) => !!item.text.trim()); .filter((item) => !!item.text.trim());
// Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds // Recompute default indexes, Merge ids of the same index, reduce the number of rebuilds
const defaultIndexes = getDefaultIndex({ q, a }); const defaultIndexes = getDefaultIndex({ q, a, indexSize });
const concatDefaultIndexes = defaultIndexes.map((item) => { const concatDefaultIndexes = defaultIndexes.map((item) => {
const oldIndex = indexes!.find((index) => index.text === item.text); const oldIndex = indexes!.find((index) => index.text === item.text);
if (oldIndex) { if (oldIndex) {
@ -56,11 +94,24 @@ const formatIndexes = ({
(item, index, self) => index === self.findIndex((t) => t.text === item.text) (item, index, self) => index === self.findIndex((t) => t.text === item.text)
); );
return indexes.map((index) => ({ const chekcIndexes = (
type: index.type, await Promise.all(
text: index.text, indexes.map(async (item) => {
dataId: index.dataId // If oversize tokens, split it
})); const tokens = await countPromptTokens(item.text);
if (tokens > indexSize) {
const splitText = splitText2Chunks({ text: item.text, chunkSize: 512 }).chunks;
return splitText.map((text) => ({
text,
type: item.type
}));
}
return item;
})
)
).flat();
return chekcIndexes;
}; };
/* insert data. /* insert data.
* 1. create data id * 1. create data id
@ -75,30 +126,40 @@ export async function insertData2Dataset({
q, q,
a = '', a = '',
chunkIndex = 0, chunkIndex = 0,
indexSize = 512,
indexes, indexes,
model, embeddingModel,
session session
}: CreateDatasetDataProps & { }: CreateDatasetDataProps & {
model: string; embeddingModel: string;
indexSize?: number;
session?: ClientSession; session?: ClientSession;
}) { }) {
if (!q || !datasetId || !collectionId || !model) { if (!q || !datasetId || !collectionId || !embeddingModel) {
return Promise.reject('q, datasetId, collectionId, model is required'); return Promise.reject('q, datasetId, collectionId, embeddingModel is required');
} }
if (String(teamId) === String(tmbId)) { if (String(teamId) === String(tmbId)) {
return Promise.reject("teamId and tmbId can't be the same"); return Promise.reject("teamId and tmbId can't be the same");
} }
const embModel = getEmbeddingModel(embeddingModel);
indexSize = Math.min(embModel.maxToken, indexSize);
// 1. Get vector indexes and insert // 1. Get vector indexes and insert
// Empty indexes check, if empty, create default index // Empty indexes check, if empty, create default index
const newIndexes = formatIndexes({ indexes, q, a }); const newIndexes = await formatIndexes({
indexes,
q,
a,
indexSize
});
// insert to vector store // insert to vector store
const result = await Promise.all( const result = await Promise.all(
newIndexes.map(async (item) => { newIndexes.map(async (item) => {
const result = await insertDatasetDataVector({ const result = await insertDatasetDataVector({
query: item.text, query: item.text,
model: getEmbeddingModel(model), model: embModel,
teamId, teamId,
datasetId, datasetId,
collectionId collectionId
@ -163,8 +224,9 @@ export async function updateData2Dataset({
q = '', q = '',
a, a,
indexes, indexes,
model model,
}: UpdateDatasetDataProps & { model: string }) { indexSize = 512
}: UpdateDatasetDataProps & { model: string; indexSize?: number }) {
if (!Array.isArray(indexes)) { if (!Array.isArray(indexes)) {
return Promise.reject('indexes is required'); return Promise.reject('indexes is required');
} }
@ -174,7 +236,7 @@ export async function updateData2Dataset({
if (!mongoData) return Promise.reject('core.dataset.error.Data not found'); if (!mongoData) return Promise.reject('core.dataset.error.Data not found');
// 2. Compute indexes // 2. Compute indexes
const formatIndexesResult = formatIndexes({ indexes, q, a }); const formatIndexesResult = await formatIndexes({ indexes, q, a, indexSize });
// 3. Patch indexes, create, update, delete // 3. Patch indexes, create, update, delete
const patchResult: PatchIndexesProps[] = []; const patchResult: PatchIndexesProps[] = [];
View File
@ -21,6 +21,11 @@ import {
llmCompletionsBodyFormat, llmCompletionsBodyFormat,
llmStreamResponseToAnswerText llmStreamResponseToAnswerText
} from '@fastgpt/service/core/ai/utils'; } from '@fastgpt/service/core/ai/utils';
import { LLMModelItemType } from '@fastgpt/global/core/ai/model.d';
import {
chunkAutoChunkSize,
getLLMMaxChunkSize
} from '@fastgpt/global/core/dataset/training/utils';
const reduceQueue = () => { const reduceQueue = () => {
global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0; global.qaQueueLen = global.qaQueueLen > 0 ? global.qaQueueLen - 1 : 0;
@ -129,7 +134,7 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
}); });
const answer = await llmStreamResponseToAnswerText(chatResponse); const answer = await llmStreamResponseToAnswerText(chatResponse);
const qaArr = formatSplitText(answer, text); // Formatted QA pairs const qaArr = formatSplitText({ answer, rawText: text, llmModel: modelData }); // Formatted QA pairs
addLog.info(`[QA Queue] Finish`, { addLog.info(`[QA Queue] Finish`, {
time: Date.now() - startTime, time: Date.now() - startTime,
@ -180,10 +185,18 @@ ${replaceVariable(Prompt_AgentQA.fixedText, { text })}`;
} }
// Format qa answer // Format qa answer
function formatSplitText(text: string, rawText: string) { function formatSplitText({
text = text.replace(/\\n/g, '\n'); // Convert escaped \n into real newlines answer,
rawText,
llmModel
}: {
answer: string;
rawText: string;
llmModel: LLMModelItemType;
}) {
answer = answer.replace(/\\n/g, '\n'); // Convert escaped \n into real newlines
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // Regex that matches Q/A pairs const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g; // Regex that matches Q/A pairs
const matches = text.matchAll(regex); // Collect all matches const matches = answer.matchAll(regex); // Collect all matches
const result: PushDatasetDataChunkProps[] = []; // Accumulate the final result const result: PushDatasetDataChunkProps[] = []; // Accumulate the final result
for (const match of matches) { for (const match of matches) {
@ -199,7 +212,11 @@ function formatSplitText(text: string, rawText: string) {
// empty result. direct split chunk // empty result. direct split chunk
if (result.length === 0) { if (result.length === 0) {
const { chunks } = splitText2Chunks({ text: rawText, chunkLen: 512 }); const { chunks } = splitText2Chunks({
text: rawText,
chunkSize: chunkAutoChunkSize,
maxSize: getLLMMaxChunkSize(llmModel)
});
chunks.forEach((chunk) => { chunks.forEach((chunk) => {
result.push({ result.push({
q: chunk, q: chunk,
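`formatSplitText` relies on the `Q\d+:` / `A\d+:` layout of the model's answer; when nothing matches, the raw text is simply re-split into chunks as shown above. A minimal, made-up example of the answer format the regex is written to parse:

```ts
// Illustrative only — the answer content is invented.
const answer = `Q1: What does chunkSize control?
A1: The target length of each split chunk.
Q2: What does indexSize control?
A2: The length of each generated vector index.`;

// Capture group 2 holds the question, group 5 the answer.
const regex = /Q\d+:(\s*)(.*)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q\d|$)/g;
for (const match of answer.matchAll(regex)) {
  console.log({ q: match[2], a: match[5] });
}
```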
View File
@ -245,7 +245,7 @@ const insertData = async ({
a: trainingData.a, a: trainingData.a,
chunkIndex: trainingData.chunkIndex, chunkIndex: trainingData.chunkIndex,
indexes: trainingData.indexes, indexes: trainingData.indexes,
model: trainingData.model, embeddingModel: trainingData.model,
session session
}); });
// delete data from training // delete data from training
View File
@ -60,15 +60,11 @@ export const defaultCollectionDetail: DatasetCollectionItemType = {
createTime: new Date(), createTime: new Date(),
trainingType: DatasetCollectionDataProcessModeEnum.chunk, trainingType: DatasetCollectionDataProcessModeEnum.chunk,
chunkSize: 0, chunkSize: 0,
indexSize: 512,
permission: new DatasetPermission(), permission: new DatasetPermission(),
indexAmount: 0 indexAmount: 0
}; };
export enum ChunkSettingModeEnum {
auto = 'auto',
custom = 'custom'
}
export const datasetTypeCourseMap: Record<`${DatasetTypeEnum}`, string> = { export const datasetTypeCourseMap: Record<`${DatasetTypeEnum}`, string> = {
[DatasetTypeEnum.folder]: '', [DatasetTypeEnum.folder]: '',
[DatasetTypeEnum.dataset]: '', [DatasetTypeEnum.dataset]: '',
View File
@ -1,6 +1,6 @@
import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; import type { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api';
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { ChunkSettingModeEnum } from './constants'; import { ChunkSettingModeEnum } from '@fastgpt/global/core/dataset/constants';
import { UseFormReturn } from 'react-hook-form'; import { UseFormReturn } from 'react-hook-form';
import { APIFileItem } from '@fastgpt/global/core/dataset/apiDataset'; import { APIFileItem } from '@fastgpt/global/core/dataset/apiDataset';
@ -41,7 +41,7 @@ export type ImportSourceParamsType = UseFormReturn<
{ {
chunkSize: number; chunkSize: number;
chunkOverlapRatio: number; chunkOverlapRatio: number;
customSplitChar: string; chunkSplitter: string;
prompt: string; prompt: string;
mode: TrainingModeEnum; mode: TrainingModeEnum;
way: ChunkSettingModeEnum; way: ChunkSettingModeEnum;