diff --git a/docSite/content/zh-cn/docs/development/upgrading/4911.md b/docSite/content/zh-cn/docs/development/upgrading/4911.md index 9c9410b17..ce9e37a24 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/4911.md +++ b/docSite/content/zh-cn/docs/development/upgrading/4911.md @@ -10,8 +10,9 @@ weight: 789 ## 🚀 新增内容 -1. 工作流中增加节点搜索功能。 -2. 工作流中,子流程版本控制,可选择“保持最新版本”,无需手动更新。 +1. 商业版支持图片知识库。 +2. 工作流中增加节点搜索功能。 +3. 工作流中,子流程版本控制,可选择“保持最新版本”,无需手动更新。 ## ⚙️ 优化 diff --git a/packages/global/common/file/icon.ts b/packages/global/common/file/icon.ts index 40928c5c2..f2baf7613 100644 --- a/packages/global/common/file/icon.ts +++ b/packages/global/common/file/icon.ts @@ -6,7 +6,8 @@ export const fileImgs = [ { suffix: '(doc|docs)', src: 'file/fill/doc' }, { suffix: 'txt', src: 'file/fill/txt' }, { suffix: 'md', src: 'file/fill/markdown' }, - { suffix: 'html', src: 'file/fill/html' } + { suffix: 'html', src: 'file/fill/html' }, + { suffix: '(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|tif)', src: 'image' } // { suffix: '.', src: '/imgs/files/file.svg' } ]; diff --git a/packages/global/common/frequenctLimit/type.d.ts b/packages/global/common/frequenctLimit/type.d.ts index 3f326a40c..fb5bf2105 100644 --- a/packages/global/common/frequenctLimit/type.d.ts +++ b/packages/global/common/frequenctLimit/type.d.ts @@ -2,4 +2,5 @@ export type AuthFrequencyLimitProps = { eventId: string; maxAmount: number; expiredTime: Date; + num?: number; }; diff --git a/packages/global/common/string/tools.ts b/packages/global/common/string/tools.ts index 39ca3d24c..dc4c845a9 100644 --- a/packages/global/common/string/tools.ts +++ b/packages/global/common/string/tools.ts @@ -34,7 +34,7 @@ export const valToStr = (val: any) => { }; // replace {{variable}} to value -export function replaceVariable(text: any, obj: Record) { +export function replaceVariable(text: any, obj: Record) { if (typeof text !== 'string') return text; for (const key in obj) { diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 92dc32ed3..e854b3194 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -1,4 +1,9 @@ -import type { ChunkSettingsType, DatasetDataIndexItemType, DatasetSchemaType } from './type'; +import type { + ChunkSettingsType, + DatasetDataIndexItemType, + DatasetDataFieldType, + DatasetSchemaType +} from './type'; import type { DatasetCollectionTypeEnum, DatasetCollectionDataProcessModeEnum, @@ -7,8 +12,7 @@ import type { ChunkTriggerConfigTypeEnum, ParagraphChunkAIModeEnum } from './constants'; -import type { LLMModelItemType } from '../ai/model.d'; -import type { ParentIdType } from 'common/parentFolder/type'; +import type { ParentIdType } from '../../common/parentFolder/type'; /* ================= dataset ===================== */ export type DatasetUpdateBody = { @@ -100,6 +104,9 @@ export type ExternalFileCreateDatasetCollectionParams = ApiCreateDatasetCollecti externalFileUrl: string; filename?: string; }; +export type ImageCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams & { + collectionName: string; +}; /* ================= tag ===================== */ export type CreateDatasetCollectionTagParams = { @@ -125,8 +132,9 @@ export type PgSearchRawType = { score: number; }; export type PushDatasetDataChunkProps = { - q: string; // embedding content - a?: string; // bonus content + q?: string; + a?: string; + imageId?: string; chunkIndex?: number; indexes?: Omit[]; }; diff --git a/packages/global/core/dataset/constants.ts 
b/packages/global/core/dataset/constants.ts index 25acca563..f8c651255 100644 --- a/packages/global/core/dataset/constants.ts +++ b/packages/global/core/dataset/constants.ts @@ -77,7 +77,8 @@ export enum DatasetCollectionTypeEnum { file = 'file', link = 'link', // one link externalFile = 'externalFile', - apiFile = 'apiFile' + apiFile = 'apiFile', + images = 'images' } export const DatasetCollectionTypeMap = { [DatasetCollectionTypeEnum.folder]: { @@ -97,6 +98,9 @@ export const DatasetCollectionTypeMap = { }, [DatasetCollectionTypeEnum.apiFile]: { name: i18nT('common:core.dataset.apiFile') + }, + [DatasetCollectionTypeEnum.images]: { + name: i18nT('dataset:core.dataset.Image collection') } }; @@ -120,6 +124,7 @@ export const DatasetCollectionSyncResultMap = { export enum DatasetCollectionDataProcessModeEnum { chunk = 'chunk', qa = 'qa', + imageParse = 'imageParse', backup = 'backup', auto = 'auto' // abandon @@ -133,6 +138,10 @@ export const DatasetCollectionDataProcessModeMap = { label: i18nT('common:core.dataset.training.QA mode'), tooltip: i18nT('common:core.dataset.import.QA Import Tip') }, + [DatasetCollectionDataProcessModeEnum.imageParse]: { + label: i18nT('dataset:training.Image mode'), + tooltip: i18nT('common:core.dataset.import.Chunk Split Tip') + }, [DatasetCollectionDataProcessModeEnum.backup]: { label: i18nT('dataset:backup_mode'), tooltip: i18nT('dataset:backup_mode') @@ -172,14 +181,16 @@ export enum ImportDataSourceEnum { fileCustom = 'fileCustom', externalFile = 'externalFile', apiDataset = 'apiDataset', - reTraining = 'reTraining' + reTraining = 'reTraining', + imageDataset = 'imageDataset' } export enum TrainingModeEnum { chunk = 'chunk', qa = 'qa', auto = 'auto', - image = 'image' + image = 'image', + imageParse = 'imageParse' } /* ------------ search -------------- */ diff --git a/packages/global/core/dataset/controller.d.ts b/packages/global/core/dataset/controller.d.ts index 7a90ae5eb..3edf0954a 100644 --- a/packages/global/core/dataset/controller.d.ts +++ b/packages/global/core/dataset/controller.d.ts @@ -8,17 +8,19 @@ export type CreateDatasetDataProps = { chunkIndex?: number; q: string; a?: string; + imageId?: string; indexes?: Omit[]; }; export type UpdateDatasetDataProps = { dataId: string; - q?: string; + q: string; a?: string; indexes?: (Omit & { dataId?: string; // pg data id })[]; + imageId?: string; }; export type PatchIndexesProps = diff --git a/packages/global/core/dataset/image/type.d.ts b/packages/global/core/dataset/image/type.d.ts new file mode 100644 index 000000000..ed6dabe58 --- /dev/null +++ b/packages/global/core/dataset/image/type.d.ts @@ -0,0 +1,13 @@ +export type DatasetImageSchema = { + _id: string; + teamId: string; + datasetId: string; + collectionId?: string; + name: string; + contentType: string; + size: number; + metadata?: Record; + expiredTime?: Date; + createdAt: Date; + updatedAt: Date; +}; diff --git a/packages/global/core/dataset/type.d.ts b/packages/global/core/dataset/type.d.ts index f3424eaa3..57b7a9bba 100644 --- a/packages/global/core/dataset/type.d.ts +++ b/packages/global/core/dataset/type.d.ts @@ -16,6 +16,7 @@ import type { DatasetPermission } from '../../support/permission/dataset/control import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset'; import type { SourceMemberType } from 'support/user/type'; import type { DatasetDataIndexTypeEnum } from './data/constants'; +import type { ParentIdType } from 'common/parentFolder/type'; export type ChunkSettingsType = { trainingType?: 
DatasetCollectionDataProcessModeEnum; @@ -49,7 +50,7 @@ export type ChunkSettingsType = { export type DatasetSchemaType = { _id: string; - parentId?: string; + parentId: ParentIdType; userId: string; teamId: string; tmbId: string; @@ -132,7 +133,13 @@ export type DatasetDataIndexItemType = { dataId: string; // pg data id text: string; }; -export type DatasetDataSchemaType = { + +export type DatasetDataFieldType = { + q: string; // large chunks or question + a?: string; // answer or custom content + imageId?: string; +}; +export type DatasetDataSchemaType = DatasetDataFieldType & { _id: string; userId: string; teamId: string; @@ -141,13 +148,9 @@ export type DatasetDataSchemaType = { collectionId: string; chunkIndex: number; updateTime: Date; - q: string; // large chunks or question - a: string; // answer or custom content - history?: { - q: string; - a: string; + history?: (DatasetDataFieldType & { updateTime: Date; - }[]; + })[]; forbid?: boolean; fullTextToken: string; indexes: DatasetDataIndexItemType[]; @@ -179,6 +182,7 @@ export type DatasetTrainingSchemaType = { dataId?: string; q: string; a: string; + imageId?: string; chunkIndex: number; indexSize?: number; weight: number; @@ -244,20 +248,18 @@ export type DatasetCollectionItemType = CollectionWithDatasetType & { }; /* ================= data ===================== */ -export type DatasetDataItemType = { +export type DatasetDataItemType = DatasetDataFieldType & { id: string; teamId: string; datasetId: string; + imagePreivewUrl?: string; updateTime: Date; collectionId: string; sourceName: string; sourceId?: string; - q: string; - a: string; chunkIndex: number; indexes: DatasetDataIndexItemType[]; isOwner: boolean; - // permission: DatasetPermission; }; /* --------------- file ---------------------- */ diff --git a/packages/global/core/dataset/utils.ts b/packages/global/core/dataset/utils.ts index 17167b12a..42dbc2315 100644 --- a/packages/global/core/dataset/utils.ts +++ b/packages/global/core/dataset/utils.ts @@ -2,10 +2,15 @@ import { TrainingModeEnum, DatasetCollectionTypeEnum } from './constants'; import { getFileIcon } from '../../common/file/icon'; import { strIsLink } from '../../common/string/tools'; -export function getCollectionIcon( - type: DatasetCollectionTypeEnum = DatasetCollectionTypeEnum.file, - name = '' -) { +export function getCollectionIcon({ + type = DatasetCollectionTypeEnum.file, + name = '', + sourceId +}: { + type?: DatasetCollectionTypeEnum; + name?: string; + sourceId?: string; +}) { if (type === DatasetCollectionTypeEnum.folder) { return 'common/folderFill'; } @@ -15,7 +20,10 @@ export function getCollectionIcon( if (type === DatasetCollectionTypeEnum.virtual) { return 'file/fill/manual'; } - return getFileIcon(name); + if (type === DatasetCollectionTypeEnum.images) { + return 'core/dataset/imageFill'; + } + return getSourceNameIcon({ sourceName: name, sourceId }); } export function getSourceNameIcon({ sourceName, diff --git a/packages/service/common/buffer/rawText/controller.ts b/packages/service/common/buffer/rawText/controller.ts index 59370d033..8750494a3 100644 --- a/packages/service/common/buffer/rawText/controller.ts +++ b/packages/service/common/buffer/rawText/controller.ts @@ -142,23 +142,26 @@ export const updateRawTextBufferExpiredTime = async ({ }; export const clearExpiredRawTextBufferCron = async () => { + const gridBucket = getGridBucket(); + const clearExpiredRawTextBuffer = async () => { addLog.debug('Clear expired raw text buffer start'); - const gridBucket = getGridBucket(); - return 
retryFn(async () => { - const data = await MongoRawTextBufferSchema.find( - { - 'metadata.expiredTime': { $lt: new Date() } - }, - '_id' - ).lean(); + const data = await MongoRawTextBufferSchema.find( + { + 'metadata.expiredTime': { $lt: new Date() } + }, + '_id' + ).lean(); - for (const item of data) { + for (const item of data) { + try { await gridBucket.delete(item._id); + } catch (error) { + addLog.error('Delete expired raw text buffer error', error); } - addLog.debug('Clear expired raw text buffer end'); - }); + } + addLog.debug('Clear expired raw text buffer end'); }; setCron('*/10 * * * *', async () => { diff --git a/packages/service/common/file/gridfs/controller.ts b/packages/service/common/file/gridfs/controller.ts index 05708ed20..b3a694b76 100644 --- a/packages/service/common/file/gridfs/controller.ts +++ b/packages/service/common/file/gridfs/controller.ts @@ -7,12 +7,13 @@ import { MongoChatFileSchema, MongoDatasetFileSchema } from './schema'; import { detectFileEncoding, detectFileEncodingByPath } from '@fastgpt/global/common/file/tools'; import { CommonErrEnum } from '@fastgpt/global/common/error/code/common'; import { readRawContentByFileBuffer } from '../read/utils'; -import { gridFsStream2Buffer, stream2Encoding } from './utils'; +import { computeGridFsChunSize, gridFsStream2Buffer, stream2Encoding } from './utils'; import { addLog } from '../../system/log'; import { parseFileExtensionFromUrl } from '@fastgpt/global/common/string/tools'; import { Readable } from 'stream'; import { addRawTextBuffer, getRawTextBuffer } from '../../buffer/rawText/controller'; import { addMinutes } from 'date-fns'; +import { retryFn } from '@fastgpt/global/common/system/utils'; export function getGFSCollection(bucket: `${BucketNameEnum}`) { MongoDatasetFileSchema; @@ -64,23 +65,7 @@ export async function uploadFile({ // create a gridfs bucket const bucket = getGridBucket(bucketName); - const fileSize = stats.size; - // 单块大小:尽可能大,但不超过 14MB,不小于512KB - const chunkSizeBytes = (() => { - // 计算理想块大小:文件大小 ÷ 目标块数(10)。 并且每个块需要小于 14MB - const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024); - - // 确保块大小至少为512KB - const minChunkSize = 512 * 1024; // 512KB - - // 取理想块大小和最小块大小中的较大值 - let chunkSize = Math.max(idealChunkSize, minChunkSize); - - // 将块大小向上取整到最接近的64KB的倍数,使其更整齐 - chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024); - - return chunkSize; - })(); + const chunkSizeBytes = computeGridFsChunSize(stats.size); const stream = bucket.openUploadStream(filename, { metadata, @@ -173,24 +158,18 @@ export async function getFileById({ export async function delFileByFileIdList({ bucketName, - fileIdList, - retry = 3 + fileIdList }: { bucketName: `${BucketNameEnum}`; fileIdList: string[]; - retry?: number; }): Promise { - try { + return retryFn(async () => { const bucket = getGridBucket(bucketName); for await (const fileId of fileIdList) { await bucket.delete(new Types.ObjectId(fileId)); } - } catch (error) { - if (retry > 0) { - return delFileByFileIdList({ bucketName, fileIdList, retry: retry - 1 }); - } - } + }); } export async function getDownloadStream({ diff --git a/packages/service/common/file/gridfs/utils.ts b/packages/service/common/file/gridfs/utils.ts index c743b7136..4c72fb61d 100644 --- a/packages/service/common/file/gridfs/utils.ts +++ b/packages/service/common/file/gridfs/utils.ts @@ -105,3 +105,20 @@ export const stream2Encoding = async (stream: NodeJS.ReadableStream) => { stream: copyStream }; }; + +// 单块大小:尽可能大,但不超过 14MB,不小于512KB +export const 
computeGridFsChunSize = (fileSize: number) => { + // 计算理想块大小:文件大小 ÷ 目标块数(10)。 并且每个块需要小于 14MB + const idealChunkSize = Math.min(Math.ceil(fileSize / 10), 14 * 1024 * 1024); + + // 确保块大小至少为512KB + const minChunkSize = 512 * 1024; // 512KB + + // 取理想块大小和最小块大小中的较大值 + let chunkSize = Math.max(idealChunkSize, minChunkSize); + + // 将块大小向上取整到最接近的64KB的倍数,使其更整齐 + chunkSize = Math.ceil(chunkSize / (64 * 1024)) * (64 * 1024); + + return chunkSize; +}; diff --git a/packages/service/common/file/multer.ts b/packages/service/common/file/multer.ts index f2c159085..235a61df9 100644 --- a/packages/service/common/file/multer.ts +++ b/packages/service/common/file/multer.ts @@ -22,7 +22,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => { maxSize *= 1024 * 1024; class UploadModel { - uploader = multer({ + uploaderSingle = multer({ limits: { fieldSize: maxSize }, @@ -41,8 +41,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => { } }) }).single('file'); - - async doUpload( + async getUploadFile( req: NextApiRequest, res: NextApiResponse, originBucketName?: `${BucketNameEnum}` @@ -54,7 +53,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => { bucketName?: `${BucketNameEnum}`; }>((resolve, reject) => { // @ts-ignore - this.uploader(req, res, (error) => { + this.uploaderSingle(req, res, (error) => { if (error) { return reject(error); } @@ -94,6 +93,58 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => { }); }); } + + uploaderMultiple = multer({ + limits: { + fieldSize: maxSize + }, + preservePath: true, + storage: multer.diskStorage({ + // destination: (_req, _file, cb) => { + // cb(null, tmpFileDirPath); + // }, + filename: (req, file, cb) => { + if (!file?.originalname) { + cb(new Error('File not found'), ''); + } else { + const { ext } = path.parse(decodeURIComponent(file.originalname)); + cb(null, `${getNanoid()}${ext}`); + } + } + }) + }).array('file', global.feConfigs?.uploadFileMaxSize); + async getUploadFiles(req: NextApiRequest, res: NextApiResponse) { + return new Promise<{ + files: FileType[]; + data: T; + }>((resolve, reject) => { + // @ts-ignore + this.uploaderMultiple(req, res, (error) => { + if (error) { + console.log(error); + return reject(error); + } + + // @ts-ignore + const files = req.files as FileType[]; + + resolve({ + files: files.map((file) => ({ + ...file, + originalname: decodeURIComponent(file.originalname) + })), + data: (() => { + if (!req.body?.data) return {}; + try { + return JSON.parse(req.body.data); + } catch (error) { + return {}; + } + })() + }); + }); + }); + } } return new UploadModel(); diff --git a/packages/service/common/system/frequencyLimit/utils.ts b/packages/service/common/system/frequencyLimit/utils.ts index 50cb7c88b..e62cf1679 100644 --- a/packages/service/common/system/frequencyLimit/utils.ts +++ b/packages/service/common/system/frequencyLimit/utils.ts @@ -4,7 +4,8 @@ import { MongoFrequencyLimit } from './schema'; export const authFrequencyLimit = async ({ eventId, maxAmount, - expiredTime + expiredTime, + num = 1 }: AuthFrequencyLimitProps) => { try { // 对应 eventId 的 account+1, 不存在的话,则创建一个 @@ -14,7 +15,7 @@ export const authFrequencyLimit = async ({ expiredTime: { $gte: new Date() } }, { - $inc: { amount: 1 }, + $inc: { amount: num }, // If not exist, set the expiredTime $setOnInsert: { expiredTime } }, diff --git a/packages/service/common/system/timerLock/constants.ts b/packages/service/common/system/timerLock/constants.ts index 010711257..76189686c 
100644 --- a/packages/service/common/system/timerLock/constants.ts +++ b/packages/service/common/system/timerLock/constants.ts @@ -6,7 +6,9 @@ export enum TimerIdEnum { updateStandardPlan = 'updateStandardPlan', scheduleTriggerApp = 'scheduleTriggerApp', notification = 'notification', - clearExpiredRawTextBuffer = 'clearExpiredRawTextBuffer' + + clearExpiredRawTextBuffer = 'clearExpiredRawTextBuffer', + clearExpiredDatasetImage = 'clearExpiredDatasetImage' } export enum LockNotificationEnum { diff --git a/packages/service/core/ai/model.ts b/packages/service/core/ai/model.ts index 2c53498c5..55a4f9747 100644 --- a/packages/service/core/ai/model.ts +++ b/packages/service/core/ai/model.ts @@ -20,6 +20,10 @@ export const getVlmModel = (model?: string) => { ?.find((item) => item.model === model || item.name === model); }; +export const getVlmModelList = () => { + return Array.from(global.llmModelMap.values())?.filter((item) => item.vision) || []; +}; + export const getDefaultEmbeddingModel = () => global?.systemDefaultModel.embedding!; export const getEmbeddingModel = (model?: string) => { if (!model) return getDefaultEmbeddingModel(); diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index c734e8896..ffa15f5cd 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -5,9 +5,10 @@ import { } from '@fastgpt/global/core/dataset/constants'; import type { CreateDatasetCollectionParams } from '@fastgpt/global/core/dataset/api.d'; import { MongoDatasetCollection } from './schema'; -import { - type DatasetCollectionSchemaType, - type DatasetSchemaType +import type { + DatasetCollectionSchemaType, + DatasetDataFieldType, + DatasetSchemaType } from '@fastgpt/global/core/dataset/type'; import { MongoDatasetTraining } from '../training/schema'; import { MongoDatasetData } from '../data/schema'; @@ -15,7 +16,7 @@ import { delImgByRelatedId } from '../../../common/file/image/controller'; import { deleteDatasetDataVector } from '../../../common/vectorDB/controller'; import { delFileByFileIdList } from '../../../common/file/gridfs/controller'; import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; -import { type ClientSession } from '../../../common/mongo'; +import type { ClientSession } from '../../../common/mongo'; import { createOrGetCollectionTags } from './utils'; import { rawText2Chunks } from '../read'; import { checkDatasetLimit } from '../../../support/permission/teamLimit'; @@ -38,20 +39,25 @@ import { getLLMMaxChunkSize } from '@fastgpt/global/core/dataset/training/utils'; import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants'; +import { deleteDatasetImage } from '../image/controller'; +import { clearCollectionImages, removeDatasetImageExpiredTime } from '../image/utils'; export const createCollectionAndInsertData = async ({ dataset, rawText, relatedId, + imageIds, createCollectionParams, backupParse = false, billId, session }: { dataset: DatasetSchemaType; - rawText: string; + rawText?: string; relatedId?: string; + imageIds?: string[]; createCollectionParams: CreateOneCollectionParams; + backupParse?: boolean; billId?: string; @@ -69,13 +75,13 @@ export const createCollectionAndInsertData = async ({ // Set default params const trainingType = createCollectionParams.trainingType || DatasetCollectionDataProcessModeEnum.chunk; - const chunkSize = computeChunkSize({ - ...createCollectionParams, - 
trainingType, - llmModel: getLLMModel(dataset.agentModel) - }); const chunkSplitter = computeChunkSplitter(createCollectionParams); const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams); + const trainingMode = getTrainingModeByCollection({ + trainingType: trainingType, + autoIndexes: createCollectionParams.autoIndexes, + imageIndex: createCollectionParams.imageIndex + }); if ( trainingType === DatasetCollectionDataProcessModeEnum.qa || @@ -90,35 +96,60 @@ export const createCollectionAndInsertData = async ({ delete createCollectionParams.qaPrompt; } - // 1. split chunks - const chunks = rawText2Chunks({ - rawText, - chunkTriggerType: createCollectionParams.chunkTriggerType, - chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize, - chunkSize, - paragraphChunkDeep, - paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize, - maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), - overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, - customReg: chunkSplitter ? [chunkSplitter] : [], - backupParse - }); + // 1. split chunks or create image chunks + const { + chunks, + chunkSize + }: { + chunks: Array<{ + q?: string; + a?: string; // answer or custom content + imageId?: string; + indexes?: string[]; + }>; + chunkSize?: number; + } = (() => { + if (rawText) { + const chunkSize = computeChunkSize({ + ...createCollectionParams, + trainingType, + llmModel: getLLMModel(dataset.agentModel) + }); + // Process text chunks + const chunks = rawText2Chunks({ + rawText, + chunkTriggerType: createCollectionParams.chunkTriggerType, + chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize, + chunkSize, + paragraphChunkDeep, + paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize, + maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)), + overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0, + customReg: chunkSplitter ? [chunkSplitter] : [], + backupParse + }); + return { chunks, chunkSize }; + } + + if (imageIds) { + // Process image chunks + const chunks = imageIds.map((imageId: string) => ({ + imageId, + indexes: [] + })); + return { chunks }; + } + throw new Error('Either rawText or imageIdList must be provided'); + })(); // 2. auth limit await checkDatasetLimit({ teamId, - insertLen: predictDataLimitLength( - getTrainingModeByCollection({ - trainingType: trainingType, - autoIndexes: createCollectionParams.autoIndexes, - imageIndex: createCollectionParams.imageIndex - }), - chunks - ) + insertLen: predictDataLimitLength(trainingMode, chunks) }); const fn = async (session: ClientSession) => { - // 3. create collection + // 3. Create collection const { _id: collectionId } = await createOneCollection({ ...createCollectionParams, trainingType, @@ -126,8 +157,8 @@ export const createCollectionAndInsertData = async ({ chunkSize, chunkSplitter, - hashRawText: hashStr(rawText), - rawTextLength: rawText.length, + hashRawText: rawText ? 
hashStr(rawText) : undefined, + rawTextLength: rawText?.length, nextSyncTime: (() => { // ignore auto collections sync for website datasets if (!dataset.autoSync && dataset.type === DatasetTypeEnum.websiteDataset) return undefined; @@ -169,11 +200,7 @@ export const createCollectionAndInsertData = async ({ vectorModel: dataset.vectorModel, vlmModel: dataset.vlmModel, indexSize: createCollectionParams.indexSize, - mode: getTrainingModeByCollection({ - trainingType: trainingType, - autoIndexes: createCollectionParams.autoIndexes, - imageIndex: createCollectionParams.imageIndex - }), + mode: trainingMode, prompt: createCollectionParams.qaPrompt, billId: traingBillId, data: chunks.map((item, index) => ({ @@ -187,7 +214,12 @@ export const createCollectionAndInsertData = async ({ session }); - // 6. remove related image ttl + // 6. Remove images ttl index + await removeDatasetImageExpiredTime({ + ids: imageIds, + collectionId, + session + }); if (relatedId) { await MongoImage.updateMany( { @@ -207,7 +239,7 @@ export const createCollectionAndInsertData = async ({ } return { - collectionId, + collectionId: String(collectionId), insertResults }; }; @@ -288,17 +320,20 @@ export const delCollectionRelatedSource = async ({ .map((item) => item?.metadata?.relatedImgId || '') .filter(Boolean); - // Delete files - await delFileByFileIdList({ - bucketName: BucketNameEnum.dataset, - fileIdList - }); - // Delete images - await delImgByRelatedId({ - teamId, - relateIds: relatedImageIds, - session - }); + // Delete files and images in parallel + await Promise.all([ + // Delete files + delFileByFileIdList({ + bucketName: BucketNameEnum.dataset, + fileIdList + }), + // Delete images + delImgByRelatedId({ + teamId, + relateIds: relatedImageIds, + session + }) + ]); }; /** * delete collection and it related data @@ -343,16 +378,16 @@ export async function delCollection({ datasetId: { $in: datasetIds }, collectionId: { $in: collectionIds } }), + // Delete dataset_images + clearCollectionImages(collectionIds), + // Delete images if needed ...(delImg - ? [ - delImgByRelatedId({ - teamId, - relateIds: collections - .map((item) => item?.metadata?.relatedImgId || '') - .filter(Boolean) - }) - ] + ? collections + .map((item) => item?.metadata?.relatedImgId || '') + .filter(Boolean) + .map((imageId) => deleteDatasetImage(imageId)) : []), + // Delete files if needed ...(delFile ? 
[ delFileByFileIdList({ diff --git a/packages/service/core/dataset/collection/utils.ts b/packages/service/core/dataset/collection/utils.ts index 96310ecb3..bd7349cc5 100644 --- a/packages/service/core/dataset/collection/utils.ts +++ b/packages/service/core/dataset/collection/utils.ts @@ -1,11 +1,9 @@ import { MongoDatasetCollection } from './schema'; -import { type ClientSession } from '../../../common/mongo'; +import type { ClientSession } from '../../../common/mongo'; import { MongoDatasetCollectionTags } from '../tag/schema'; import { readFromSecondary } from '../../../common/mongo/utils'; -import { - type CollectionWithDatasetType, - type DatasetCollectionSchemaType -} from '@fastgpt/global/core/dataset/type'; +import type { CollectionWithDatasetType } from '@fastgpt/global/core/dataset/type'; +import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type'; import { DatasetCollectionDataProcessModeEnum, DatasetCollectionSyncResultEnum, @@ -233,18 +231,37 @@ export const syncCollection = async (collection: CollectionWithDatasetType) => { QA: 独立进程 Chunk: Image Index -> Auto index -> chunk index */ -export const getTrainingModeByCollection = (collection: { - trainingType: DatasetCollectionSchemaType['trainingType']; - autoIndexes?: DatasetCollectionSchemaType['autoIndexes']; - imageIndex?: DatasetCollectionSchemaType['imageIndex']; +export const getTrainingModeByCollection = ({ + trainingType, + autoIndexes, + imageIndex +}: { + trainingType: DatasetCollectionDataProcessModeEnum; + autoIndexes?: boolean; + imageIndex?: boolean; }) => { - if (collection.trainingType === DatasetCollectionDataProcessModeEnum.qa) { + if ( + trainingType === DatasetCollectionDataProcessModeEnum.imageParse && + global.feConfigs?.isPlus + ) { + return TrainingModeEnum.imageParse; + } + + if (trainingType === DatasetCollectionDataProcessModeEnum.qa) { return TrainingModeEnum.qa; } - if (collection.imageIndex && global.feConfigs?.isPlus) { + if ( + trainingType === DatasetCollectionDataProcessModeEnum.chunk && + imageIndex && + global.feConfigs?.isPlus + ) { return TrainingModeEnum.image; } - if (collection.autoIndexes && global.feConfigs?.isPlus) { + if ( + trainingType === DatasetCollectionDataProcessModeEnum.chunk && + autoIndexes && + global.feConfigs?.isPlus + ) { return TrainingModeEnum.auto; } return TrainingModeEnum.chunk; diff --git a/packages/service/core/dataset/controller.ts b/packages/service/core/dataset/controller.ts index 1ae7a8d58..c8ab49910 100644 --- a/packages/service/core/dataset/controller.ts +++ b/packages/service/core/dataset/controller.ts @@ -9,6 +9,7 @@ import { deleteDatasetDataVector } from '../../common/vectorDB/controller'; import { MongoDatasetDataText } from './data/dataTextSchema'; import { DatasetErrEnum } from '@fastgpt/global/common/error/code/dataset'; import { retryFn } from '@fastgpt/global/common/system/utils'; +import { clearDatasetImages } from './image/utils'; /* ============= dataset ========== */ /* find all datasetId by top datasetId */ @@ -102,8 +103,10 @@ export async function delDatasetRelevantData({ }), //delete dataset_datas MongoDatasetData.deleteMany({ teamId, datasetId: { $in: datasetIds } }), - // Delete Image and file + // Delete collection image and file delCollectionRelatedSource({ collections }), + // Delete dataset Image + clearDatasetImages(datasetIds), // Delete vector data deleteDatasetDataVector({ teamId, datasetIds }) ]); diff --git a/packages/service/core/dataset/data/controller.ts b/packages/service/core/dataset/data/controller.ts 
new file mode 100644 index 000000000..08d468940 --- /dev/null +++ b/packages/service/core/dataset/data/controller.ts @@ -0,0 +1,57 @@ +import { getDatasetImagePreviewUrl } from '../image/utils'; +import type { QuoteDataItemType } from '../../../../../projects/app/src/service/core/chat/constants'; +import type { DatasetDataSchemaType } from '@fastgpt/global/core/dataset/type'; + +export const formatDatasetDataValue = ({ + q, + a, + imageId, + teamId, + datasetId +}: { + q: string; + a?: string; + imageId?: string; + teamId: string; + datasetId: string; +}): { + q: string; + a?: string; + imagePreivewUrl?: string; +} => { + if (!imageId) { + return { + q, + a + }; + } + + const previewUrl = getDatasetImagePreviewUrl({ + imageId, + teamId, + datasetId, + expiredMinutes: 60 * 24 * 7 // 7 days + }); + + return { + q: `![${q.replaceAll('\n', '\\n')}](${previewUrl})`, + a, + imagePreivewUrl: previewUrl + }; +}; + +export const getFormatDatasetCiteList = (list: DatasetDataSchemaType[]) => { + return list.map((item) => ({ + _id: item._id, + ...formatDatasetDataValue({ + teamId: item.teamId, + datasetId: item.datasetId, + q: item.q, + a: item.a, + imageId: item.imageId + }), + history: item.history, + updateTime: item.updateTime, + index: item.chunkIndex + })); +}; diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index 0f5cefa5d..5b8d07e94 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -37,8 +37,7 @@ const DatasetDataSchema = new Schema({ required: true }, a: { - type: String, - default: '' + type: String }, history: { type: [ @@ -74,6 +73,9 @@ const DatasetDataSchema = new Schema({ default: [] }, + imageId: { + type: String + }, updateTime: { type: Date, default: () => new Date() diff --git a/packages/service/core/dataset/image/controller.ts b/packages/service/core/dataset/image/controller.ts new file mode 100644 index 000000000..0d3a44b08 --- /dev/null +++ b/packages/service/core/dataset/image/controller.ts @@ -0,0 +1,166 @@ +import { addMinutes } from 'date-fns'; +import { bucketName, MongoDatasetImageSchema } from './schema'; +import { connectionMongo, Types } from '../../../common/mongo'; +import fs from 'fs'; +import type { FileType } from '../../../common/file/multer'; +import fsp from 'fs/promises'; +import { computeGridFsChunSize } from '../../../common/file/gridfs/utils'; +import { setCron } from '../../../common/system/cron'; +import { checkTimerLock } from '../../../common/system/timerLock/utils'; +import { TimerIdEnum } from '../../../common/system/timerLock/constants'; +import { addLog } from '../../../common/system/log'; + +const getGridBucket = () => { + return new connectionMongo.mongo.GridFSBucket(connectionMongo.connection.db!, { + bucketName: bucketName + }); +}; + +export const createDatasetImage = async ({ + teamId, + datasetId, + file, + expiredTime = addMinutes(new Date(), 30) +}: { + teamId: string; + datasetId: string; + file: FileType; + expiredTime?: Date; +}): Promise<{ imageId: string; previewUrl: string }> => { + const path = file.path; + const gridBucket = getGridBucket(); + const metadata = { + teamId: String(teamId), + datasetId: String(datasetId), + expiredTime + }; + + const stats = await fsp.stat(path); + if (!stats.isFile()) return Promise.reject(`${path} is not a file`); + + const readStream = fs.createReadStream(path, { + highWaterMark: 256 * 1024 + }); + const chunkSizeBytes = computeGridFsChunSize(stats.size); + + const stream = 
gridBucket.openUploadStream(file.originalname, { + metadata, + contentType: file.mimetype, + chunkSizeBytes + }); + + // save to gridfs + await new Promise((resolve, reject) => { + readStream + .pipe(stream as any) + .on('finish', resolve) + .on('error', reject); + }); + + return { + imageId: String(stream.id), + previewUrl: '' + }; +}; + +export const getDatasetImageReadData = async (imageId: string) => { + // Get file metadata to get contentType + const fileInfo = await MongoDatasetImageSchema.findOne({ + _id: new Types.ObjectId(imageId) + }).lean(); + if (!fileInfo) { + return Promise.reject('Image not found'); + } + + const gridBucket = getGridBucket(); + return { + stream: gridBucket.openDownloadStream(new Types.ObjectId(imageId)), + fileInfo + }; +}; +export const getDatasetImageBase64 = async (imageId: string) => { + // Get file metadata to get contentType + const fileInfo = await MongoDatasetImageSchema.findOne({ + _id: new Types.ObjectId(imageId) + }).lean(); + if (!fileInfo) { + return Promise.reject('Image not found'); + } + + // Get image stream from GridFS + const { stream } = await getDatasetImageReadData(imageId); + + // Convert stream to buffer + const chunks: Buffer[] = []; + + return new Promise((resolve, reject) => { + stream.on('data', (chunk: Buffer) => { + chunks.push(chunk); + }); + + stream.on('end', () => { + // Combine all chunks into a single buffer + const buffer = Buffer.concat(chunks); + // Convert buffer to base64 string + const base64 = buffer.toString('base64'); + const dataUrl = `data:${fileInfo.contentType || 'image/jpeg'};base64,${base64}`; + resolve(dataUrl); + }); + + stream.on('error', reject); + }); +}; + +export const deleteDatasetImage = async (imageId: string) => { + const gridBucket = getGridBucket(); + + try { + await gridBucket.delete(new Types.ObjectId(imageId)); + } catch (error: any) { + const msg = error?.message; + if (msg.includes('File not found')) { + addLog.warn('Delete dataset image error', error); + return; + } else { + return Promise.reject(error); + } + } +}; + +export const clearExpiredDatasetImageCron = async () => { + const gridBucket = getGridBucket(); + const clearExpiredDatasetImages = async () => { + addLog.debug('Clear expired dataset image start'); + + const data = await MongoDatasetImageSchema.find( + { + 'metadata.expiredTime': { $lt: new Date() } + }, + '_id' + ).lean(); + + for (const item of data) { + try { + await gridBucket.delete(item._id); + } catch (error) { + addLog.error('Delete expired dataset image error', error); + } + } + addLog.debug('Clear expired dataset image end'); + }; + + setCron('*/10 * * * *', async () => { + if ( + await checkTimerLock({ + timerId: TimerIdEnum.clearExpiredDatasetImage, + lockMinuted: 9 + }) + ) { + try { + await clearExpiredDatasetImages(); + } catch (error) { + addLog.error('clearExpiredDatasetImageCron error', error); + } + } + }); +}; diff --git a/packages/service/core/dataset/image/schema.ts b/packages/service/core/dataset/image/schema.ts new file mode 100644 index 000000000..674c76891 --- /dev/null +++ b/packages/service/core/dataset/image/schema.ts @@ -0,0 +1,36 @@ +import type { Types } from '../../../common/mongo'; +import { getMongoModel, Schema } from '../../../common/mongo'; + +export const bucketName = 'dataset_image'; + +const MongoDatasetImage = new Schema({ + length: { type: Number, required: true }, + chunkSize: { type: Number, required: true }, + uploadDate: { type: Date, required: true }, + filename: { type: String, required: true }, + contentType: { type: 
String, required: true }, + metadata: { + teamId: { type: String, required: true }, + datasetId: { type: String, required: true }, + collectionId: { type: String }, + expiredTime: { type: Date, required: true } + } +}); +MongoDatasetImage.index({ 'metadata.datasetId': 'hashed' }); +MongoDatasetImage.index({ 'metadata.collectionId': 'hashed' }); +MongoDatasetImage.index({ 'metadata.expiredTime': -1 }); + +export const MongoDatasetImageSchema = getMongoModel<{ + _id: Types.ObjectId; + length: number; + chunkSize: number; + uploadDate: Date; + filename: string; + contentType: string; + metadata: { + teamId: string; + datasetId: string; + collectionId: string; + expiredTime: Date; + }; +}>(`${bucketName}.files`, MongoDatasetImage); diff --git a/packages/service/core/dataset/image/utils.ts b/packages/service/core/dataset/image/utils.ts new file mode 100644 index 000000000..1ff72da97 --- /dev/null +++ b/packages/service/core/dataset/image/utils.ts @@ -0,0 +1,101 @@ +import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode'; +import { Types, type ClientSession } from '../../../common/mongo'; +import { deleteDatasetImage } from './controller'; +import { MongoDatasetImageSchema } from './schema'; +import { addMinutes } from 'date-fns'; +import jwt from 'jsonwebtoken'; + +export const removeDatasetImageExpiredTime = async ({ + ids = [], + collectionId, + session +}: { + ids?: string[]; + collectionId: string; + session?: ClientSession; +}) => { + if (ids.length === 0) return; + return MongoDatasetImageSchema.updateMany( + { + _id: { + $in: ids + .filter((id) => Types.ObjectId.isValid(id)) + .map((id) => (typeof id === 'string' ? new Types.ObjectId(id) : id)) + } + }, + { + $unset: { 'metadata.expiredTime': '' }, + $set: { + 'metadata.collectionId': String(collectionId) + } + }, + { session } + ); +}; + +export const getDatasetImagePreviewUrl = ({ + imageId, + teamId, + datasetId, + expiredMinutes +}: { + imageId: string; + teamId: string; + datasetId: string; + expiredMinutes: number; +}) => { + const expiredTime = Math.floor(addMinutes(new Date(), expiredMinutes).getTime() / 1000); + + const key = (process.env.FILE_TOKEN_KEY as string) ?? 'filetoken'; + const token = jwt.sign( + { + teamId: String(teamId), + datasetId: String(datasetId), + exp: expiredTime + }, + key + ); + + return `/api/core/dataset/image/${imageId}?token=${token}`; +}; +export const authDatasetImagePreviewUrl = (token?: string) => + new Promise<{ + teamId: string; + datasetId: string; + }>((resolve, reject) => { + if (!token) { + return reject(ERROR_ENUM.unAuthFile); + } + const key = (process.env.FILE_TOKEN_KEY as string) ?? 
'filetoken'; + + jwt.verify(token, key, (err, decoded: any) => { + if (err || !decoded?.teamId || !decoded?.datasetId) { + reject(ERROR_ENUM.unAuthFile); + return; + } + resolve({ + teamId: decoded.teamId, + datasetId: decoded.datasetId + }); + }); + }); + +export const clearDatasetImages = async (datasetIds: string[]) => { + const images = await MongoDatasetImageSchema.find( + { + 'metadata.datasetId': { $in: datasetIds.map((item) => String(item)) } + }, + '_id' + ).lean(); + await Promise.all(images.map((image) => deleteDatasetImage(String(image._id)))); +}; + +export const clearCollectionImages = async (collectionIds: string[]) => { + const images = await MongoDatasetImageSchema.find( + { + 'metadata.collectionId': { $in: collectionIds.map((item) => String(item)) } + }, + '_id' + ).lean(); + await Promise.all(images.map((image) => deleteDatasetImage(String(image._id)))); +}; diff --git a/packages/service/core/dataset/read.ts b/packages/service/core/dataset/read.ts index 647c05758..d4effdf17 100644 --- a/packages/service/core/dataset/read.ts +++ b/packages/service/core/dataset/read.ts @@ -186,9 +186,11 @@ export const rawText2Chunks = ({ chunkTriggerMinSize = 1000, backupParse, chunkSize = 512, + imageIdList, ...splitProps }: { rawText: string; + imageIdList?: string[]; chunkTriggerType?: ChunkTriggerConfigTypeEnum; chunkTriggerMinSize?: number; // maxSize from agent model, not store @@ -199,6 +201,7 @@ export const rawText2Chunks = ({ q: string; a: string; indexes?: string[]; + imageIdList?: string[]; }[] => { const parseDatasetBackup2Chunks = (rawText: string) => { const csvArr = Papa.parse(rawText).data as string[][]; @@ -209,7 +212,8 @@ export const rawText2Chunks = ({ .map((item) => ({ q: item[0] || '', a: item[1] || '', - indexes: item.slice(2) + indexes: item.slice(2), + imageIdList })) .filter((item) => item.q || item.a); @@ -231,7 +235,8 @@ export const rawText2Chunks = ({ return [ { q: rawText, - a: '' + a: '', + imageIdList } ]; } @@ -240,7 +245,7 @@ export const rawText2Chunks = ({ if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) { const textLength = rawText.trim().length; if (textLength < chunkTriggerMinSize) { - return [{ q: rawText, a: '' }]; + return [{ q: rawText, a: '', imageIdList }]; } } @@ -253,6 +258,7 @@ export const rawText2Chunks = ({ return chunks.map((item) => ({ q: item, a: '', - indexes: [] + indexes: [], + imageIdList })); }; diff --git a/packages/service/core/dataset/search/controller.ts b/packages/service/core/dataset/search/controller.ts index 65861299e..ffd94eca4 100644 --- a/packages/service/core/dataset/search/controller.ts +++ b/packages/service/core/dataset/search/controller.ts @@ -28,6 +28,7 @@ import type { NodeInputKeyEnum } from '@fastgpt/global/core/workflow/constants'; import { datasetSearchQueryExtension } from './utils'; import type { RerankModelItemType } from '@fastgpt/global/core/ai/model.d'; import { addLog } from '../../../common/system/log'; +import { formatDatasetDataValue } from '../data/controller'; export type SearchDatasetDataProps = { histories: ChatItemType[]; @@ -175,6 +176,12 @@ export async function searchDatasetData( collectionFilterMatch } = props; + // Constants data + const datasetDataSelectField = + '_id datasetId collectionId updateTime q a imageId chunkIndex indexes'; + const datsaetCollectionSelectField = + '_id name fileId rawLink apiFileId externalFileId externalFileUrl'; + /* init params */ searchMode = DatasetSearchModeMap[searchMode] ? 
searchMode : DatasetSearchModeEnum.embedding; usingReRank = usingReRank && !!getDefaultRerankModel(); @@ -463,14 +470,14 @@ export async function searchDatasetData( collectionId: { $in: collectionIdList }, 'indexes.dataId': { $in: results.map((item) => item.id?.trim()) } }, - '_id datasetId collectionId updateTime q a chunkIndex indexes', + datasetDataSelectField, { ...readFromSecondary } ).lean(), MongoDatasetCollection.find( { _id: { $in: collectionIdList } }, - '_id name fileId rawLink apiFileId externalFileId externalFileUrl', + datsaetCollectionSelectField, { ...readFromSecondary } ).lean() ]); @@ -494,8 +501,13 @@ export async function searchDatasetData( const result: SearchDataResponseItemType = { id: String(data._id), updateTime: data.updateTime, - q: data.q, - a: data.a, + ...formatDatasetDataValue({ + teamId, + datasetId: data.datasetId, + q: data.q, + a: data.a, + imageId: data.imageId + }), chunkIndex: data.chunkIndex, datasetId: String(data.datasetId), collectionId: String(data.collectionId), @@ -597,14 +609,14 @@ export async function searchDatasetData( { _id: { $in: searchResults.map((item) => item.dataId) } }, - '_id datasetId collectionId updateTime q a chunkIndex indexes', + datasetDataSelectField, { ...readFromSecondary } ).lean(), MongoDatasetCollection.find( { _id: { $in: searchResults.map((item) => item.collectionId) } }, - '_id name fileId rawLink apiFileId externalFileId externalFileUrl', + datsaetCollectionSelectField, { ...readFromSecondary } ).lean() ]); @@ -630,8 +642,13 @@ export async function searchDatasetData( datasetId: String(data.datasetId), collectionId: String(data.collectionId), updateTime: data.updateTime, - q: data.q, - a: data.a, + ...formatDatasetDataValue({ + teamId, + datasetId: data.datasetId, + q: data.q, + a: data.a, + imageId: data.imageId + }), chunkIndex: data.chunkIndex, indexes: data.indexes, ...getCollectionSourceData(collection), diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index 30708cbe0..28bd97a7d 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -12,10 +12,7 @@ import { getCollectionWithDataset } from '../controller'; import { mongoSessionRun } from '../../../common/mongo/sessionRun'; import { type PushDataToTrainingQueueProps } from '@fastgpt/global/core/dataset/training/type'; import { i18nT } from '../../../../web/i18n/utils'; -import { - getLLMDefaultChunkSize, - getLLMMaxChunkSize -} from '../../../../global/core/dataset/training/utils'; +import { getLLMMaxChunkSize } from '../../../../global/core/dataset/training/utils'; export const lockTrainingDataByTeamId = async (teamId: string): Promise => { try { @@ -65,7 +62,7 @@ export async function pushDataListToTrainingQueue({ const getImageChunkMode = (data: PushDatasetDataChunkProps, mode: TrainingModeEnum) => { if (mode !== TrainingModeEnum.image) return mode; // 检查内容中,是否包含 ![](xxx) 的图片格式 - const text = data.q + data.a || ''; + const text = (data.q || '') + (data.a || ''); const regex = /!\[\]\((.*?)\)/g; const match = text.match(regex); if (match) { @@ -82,9 +79,6 @@ export async function pushDataListToTrainingQueue({ if (!agentModelData) { return Promise.reject(i18nT('common:error_llm_not_config')); } - if (mode === TrainingModeEnum.chunk || mode === TrainingModeEnum.auto) { - prompt = undefined; - } const { model, maxToken, weight } = await (async () => { if (mode === TrainingModeEnum.chunk) { @@ -101,7 +95,7 @@ export async 
function pushDataListToTrainingQueue({ weight: 0 }; } - if (mode === TrainingModeEnum.image) { + if (mode === TrainingModeEnum.image || mode === TrainingModeEnum.imageParse) { const vllmModelData = getVlmModel(vlmModel); if (!vllmModelData) { return Promise.reject(i18nT('common:error_vlm_not_config')); @@ -117,11 +111,9 @@ export async function pushDataListToTrainingQueue({ })(); // filter repeat or equal content - const set = new Set(); const filterResult: Record = { success: [], overToken: [], - repeat: [], error: [] }; @@ -140,7 +132,7 @@ export async function pushDataListToTrainingQueue({ .filter(Boolean); // filter repeat content - if (!item.q) { + if (!item.imageId && !item.q) { filterResult.error.push(item); return; } @@ -153,32 +145,26 @@ export async function pushDataListToTrainingQueue({ return; } - if (set.has(text)) { - filterResult.repeat.push(item); - } else { - filterResult.success.push(item); - set.add(text); - } + filterResult.success.push(item); }); // insert data to db const insertLen = filterResult.success.length; - const failedDocuments: PushDatasetDataChunkProps[] = []; // 使用 insertMany 批量插入 - const batchSize = 200; + const batchSize = 500; const insertData = async (startIndex: number, session: ClientSession) => { const list = filterResult.success.slice(startIndex, startIndex + batchSize); if (list.length === 0) return; try { - await MongoDatasetTraining.insertMany( + const result = await MongoDatasetTraining.insertMany( list.map((item) => ({ teamId, tmbId, - datasetId, - collectionId, + datasetId: datasetId, + collectionId: collectionId, billId, mode: getImageChunkMode(item, mode), prompt, @@ -189,25 +175,25 @@ export async function pushDataListToTrainingQueue({ indexSize, weight: weight ?? 0, indexes: item.indexes, - retryCount: 5 + retryCount: 5, + ...(item.imageId ? 
{ imageId: item.imageId } : {}) })), { session, - ordered: true + ordered: false, + rawResult: true, + includeResultMetadata: false // 进一步减少返回数据 } ); + + if (result.insertedCount !== list.length) { + return Promise.reject(`Insert data error, ${JSON.stringify(result)}`); + } } catch (error: any) { addLog.error(`Insert error`, error); - // 如果有错误,将失败的文档添加到失败列表中 - error.writeErrors?.forEach((writeError: any) => { - failedDocuments.push(data[writeError.index]); - }); - console.log('failed', failedDocuments); + return Promise.reject(error); } - // 对于失败的文档,尝试单独插入 - await MongoDatasetTraining.create(failedDocuments, { session }); - return insertData(startIndex + batchSize, session); }; @@ -222,7 +208,6 @@ export async function pushDataListToTrainingQueue({ delete filterResult.success; return { - insertLen, - ...filterResult + insertLen }; } diff --git a/packages/service/core/dataset/training/schema.ts b/packages/service/core/dataset/training/schema.ts index 44f4e22db..1f8411f53 100644 --- a/packages/service/core/dataset/training/schema.ts +++ b/packages/service/core/dataset/training/schema.ts @@ -99,6 +99,9 @@ const TrainingDataSchema = new Schema({ ], default: [] }, + imageId: { + type: String + }, errorMsg: String }); diff --git a/packages/service/core/workflow/dispatch/chat/oneapi.ts b/packages/service/core/workflow/dispatch/chat/oneapi.ts index 84bd9ab77..c53ff2ba5 100644 --- a/packages/service/core/workflow/dispatch/chat/oneapi.ts +++ b/packages/service/core/workflow/dispatch/chat/oneapi.ts @@ -358,7 +358,7 @@ async function filterDatasetQuote({ return replaceVariable(quoteTemplate, { id: item.id, q: item.q, - a: item.a, + a: item.a || '', updateTime: formatTime2YMDHM(item.updateTime), source: item.sourceName, sourceId: String(item.sourceId || ''), diff --git a/packages/service/support/permission/dataset/auth.ts b/packages/service/support/permission/dataset/auth.ts index 579f5614a..74980aac6 100644 --- a/packages/service/support/permission/dataset/auth.ts +++ b/packages/service/support/permission/dataset/auth.ts @@ -16,6 +16,7 @@ import { type AuthModeType, type AuthResponseType } from '../type'; import { DatasetTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { type ParentIdType } from '@fastgpt/global/common/parentFolder/type'; import { DatasetDefaultPermissionVal } from '@fastgpt/global/support/permission/dataset/constant'; +import { getDatasetImagePreviewUrl } from '../../../core/dataset/image/utils'; export const authDatasetByTmbId = async ({ tmbId, @@ -267,6 +268,15 @@ export async function authDatasetData({ updateTime: datasetData.updateTime, q: datasetData.q, a: datasetData.a, + imageId: datasetData.imageId, + imagePreivewUrl: datasetData.imageId + ? 
getDatasetImagePreviewUrl({ + imageId: datasetData.imageId, + teamId: datasetData.teamId, + datasetId: datasetData.datasetId, + expiredMinutes: 30 + }) + : undefined, chunkIndex: datasetData.chunkIndex, indexes: datasetData.indexes, datasetId: String(datasetData.datasetId), diff --git a/packages/service/worker/preload.ts b/packages/service/worker/preload.ts index d2ee467b8..f1d58ba30 100644 --- a/packages/service/worker/preload.ts +++ b/packages/service/worker/preload.ts @@ -1,7 +1,7 @@ import { getWorkerController, WorkerNameEnum } from './utils'; export const preLoadWorker = async () => { - const max = Number(global.systemEnv?.tokenWorkers || 30); + const max = Math.min(Number(global.systemEnv?.tokenWorkers || 30), 100); const workerController = getWorkerController({ name: WorkerNameEnum.countGptMessagesTokens, maxReservedThreads: max diff --git a/packages/web/components/common/Icon/constants.ts b/packages/web/components/common/Icon/constants.ts index 586420fe6..d0c53defa 100644 --- a/packages/web/components/common/Icon/constants.ts +++ b/packages/web/components/common/Icon/constants.ts @@ -220,9 +220,11 @@ export const iconPaths = { import('./icons/core/dataset/feishuDatasetOutline.svg'), 'core/dataset/fileCollection': () => import('./icons/core/dataset/fileCollection.svg'), 'core/dataset/fullTextRecall': () => import('./icons/core/dataset/fullTextRecall.svg'), + 'core/dataset/imageFill': () => import('./icons/core/dataset/imageFill.svg'), 'core/dataset/manualCollection': () => import('./icons/core/dataset/manualCollection.svg'), 'core/dataset/mixedRecall': () => import('./icons/core/dataset/mixedRecall.svg'), 'core/dataset/modeEmbedding': () => import('./icons/core/dataset/modeEmbedding.svg'), + 'core/dataset/otherDataset': () => import('./icons/core/dataset/otherDataset.svg'), 'core/dataset/questionExtension': () => import('./icons/core/dataset/questionExtension.svg'), 'core/dataset/rerank': () => import('./icons/core/dataset/rerank.svg'), 'core/dataset/searchfilter': () => import('./icons/core/dataset/searchfilter.svg'), @@ -230,7 +232,6 @@ export const iconPaths = { 'core/dataset/tableCollection': () => import('./icons/core/dataset/tableCollection.svg'), 'core/dataset/tag': () => import('./icons/core/dataset/tag.svg'), 'core/dataset/websiteDataset': () => import('./icons/core/dataset/websiteDataset.svg'), - 'core/dataset/otherDataset': () => import('./icons/core/dataset/otherDataset.svg'), 'core/dataset/websiteDatasetColor': () => import('./icons/core/dataset/websiteDatasetColor.svg'), 'core/dataset/websiteDatasetOutline': () => import('./icons/core/dataset/websiteDatasetOutline.svg'), @@ -379,10 +380,12 @@ export const iconPaths = { fullScreen: () => import('./icons/fullScreen.svg'), help: () => import('./icons/help.svg'), history: () => import('./icons/history.svg'), + image: () => import('./icons/image.svg'), infoRounded: () => import('./icons/infoRounded.svg'), kbTest: () => import('./icons/kbTest.svg'), key: () => import('./icons/key.svg'), keyPrimary: () => import('./icons/keyPrimary.svg'), + loading: () => import('./icons/loading.svg'), menu: () => import('./icons/menu.svg'), minus: () => import('./icons/minus.svg'), 'modal/AddClb': () => import('./icons/modal/AddClb.svg'), diff --git a/packages/web/components/common/Icon/icons/core/dataset/imageFill.svg b/packages/web/components/common/Icon/icons/core/dataset/imageFill.svg new file mode 100644 index 000000000..421c7c49a --- /dev/null +++ b/packages/web/components/common/Icon/icons/core/dataset/imageFill.svg @@ -0,0 +1,3 @@ + 
+ + \ No newline at end of file diff --git a/packages/web/components/common/Icon/icons/image.svg b/packages/web/components/common/Icon/icons/image.svg new file mode 100644 index 000000000..94b529725 --- /dev/null +++ b/packages/web/components/common/Icon/icons/image.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/packages/web/components/common/Icon/icons/loading.svg b/packages/web/components/common/Icon/icons/loading.svg new file mode 100644 index 000000000..10033653f --- /dev/null +++ b/packages/web/components/common/Icon/icons/loading.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/packages/web/i18n/en/chat.json b/packages/web/i18n/en/chat.json index ff931d947..211800445 100644 --- a/packages/web/i18n/en/chat.json +++ b/packages/web/i18n/en/chat.json @@ -71,13 +71,13 @@ "response_embedding_model_tokens": "Vector Model Tokens", "response_hybrid_weight": "Embedding : Full text = {{emb}} : {{text}}", "response_rerank_tokens": "Rearrange Model Tokens", + "search_results": "Search results", "select": "Select", "select_file": "Upload File", "select_file_img": "Upload file / image", "select_img": "Upload Image", "source_cronJob": "Scheduled execution", "stream_output": "Stream Output", - "to_dataset": "Go to the Knowledge Base", "unsupported_file_type": "Unsupported file types", "upload": "Upload", "variable_invisable_in_share": "Custom variables are not visible in login-free links", diff --git a/packages/web/i18n/en/common.json b/packages/web/i18n/en/common.json index d30efe081..0b203e9a4 100644 --- a/packages/web/i18n/en/common.json +++ b/packages/web/i18n/en/common.json @@ -180,7 +180,7 @@ "code_error.user_error.balance_not_enough": "Insufficient Account Balance", "code_error.user_error.bin_visitor_guest": "You Are Currently a Guest, Unauthorized to Operate", "code_error.user_error.un_auth_user": "User Not Found", - "comfirm_import": "comfirm_import", + "comfirm_import": "Confirm import", "comfirm_leave_page": "Confirm to Leave This Page?", "comfirn_create": "Confirm Creation", "commercial_function_tip": "Please Upgrade to the Commercial Version to Use This Feature: https://doc.fastgpt.cn/docs/commercial/intro/", @@ -403,7 +403,6 @@ "core.chat.response.module model": "Model", "core.chat.response.module name": "Model Name", "core.chat.response.module query": "Question/Search Term", - "core.chat.response.module quoteList": "Quote Content", "core.chat.response.module similarity": "Similarity", "core.chat.response.module temperature": "Temperature", "core.chat.response.module time": "Run Time", @@ -434,7 +433,6 @@ "core.dataset.Text collection": "Text Dataset", "core.dataset.apiFile": "API File", "core.dataset.collection.Click top config website": "Click to Configure Website", - "core.dataset.collection.Collection name": "Dataset Name", "core.dataset.collection.Collection raw text": "Dataset Content", "core.dataset.collection.Empty Tip": "The Dataset is Empty", "core.dataset.collection.QA Prompt": "QA Split Prompt", @@ -451,7 +449,6 @@ "core.dataset.collection.metadata.metadata": "Metadata", "core.dataset.collection.metadata.read source": "View Original Content", "core.dataset.collection.metadata.source": "Data Source", - "core.dataset.collection.metadata.source name": "Source Name", "core.dataset.collection.metadata.source size": "Source Size", "core.dataset.collection.status.active": "Ready", "core.dataset.collection.status.error": "Error", @@ -743,7 +740,7 @@ "core.workflow.value": "Value", "core.workflow.variable": "Variable", "create": "Create", - 
"create_failed": "Creation Failed", + "create_failed": "Create failed", "create_success": "Created Successfully", "create_time": "Creation Time", "cron_job_run_app": "Scheduled Task", @@ -788,7 +785,6 @@ "dataset.dataset_name": "Dataset Name", "dataset.deleteFolderTips": "Confirm to Delete This Folder and All Its Contained Datasets? Data Cannot Be Recovered After Deletion, Please Confirm!", "dataset.test.noResult": "No Search Results", - "dataset_data_import_q_placeholder": "Up to {{maxToken}} words.", "dataset_data_input_a": "Answer", "dataset_data_input_chunk": "Chunk", "dataset_data_input_chunk_content": "Chunk", @@ -802,7 +798,6 @@ "delete_success": "Deleted Successfully", "delete_warning": "Deletion Warning", "embedding_model_not_config": "No index model is detected", - "error.Create failed": "Create failed", "error.code_error": "Verification code error", "error.fileNotFound": "File not found~", "error.inheritPermissionError": "Inherit permission Error", @@ -1208,6 +1203,7 @@ "templateTags.Writing": "Writing", "template_market": "Template Market", "textarea_variable_picker_tip": "Enter \"/\" to select a variable", + "to_dataset": "To dataset", "ui.textarea.Magnifying": "Magnifying", "un_used": "Unused", "unauth_token": "The certificate has expired, please log in again", diff --git a/packages/web/i18n/en/dataset.json b/packages/web/i18n/en/dataset.json index 6a7b1e560..bf3168c2a 100644 --- a/packages/web/i18n/en/dataset.json +++ b/packages/web/i18n/en/dataset.json @@ -28,16 +28,21 @@ "collection.training_type": "Chunk type", "collection_data_count": "Data amount", "collection_metadata_custom_pdf_parse": "PDF enhancement analysis", + "collection_name": "Collection name", "collection_not_support_retraining": "This collection type does not support retuning parameters", "collection_not_support_sync": "This collection does not support synchronization", "collection_sync": "Sync data", "collection_sync_confirm_tip": "Confirm to start synchronizing data? \nThe system will pull the latest data for comparison. If the contents are different, a new collection will be created and the old collection will be deleted. Please confirm!", "collection_tags": "Collection Tags", + "common.dataset.data.Input Error Tip": "[Image Dataset] Process error:", + "common.error.unKnow": "Unknown error", "common_dataset": "General Dataset", "common_dataset_desc": "Building a knowledge base by importing files, web page links, or manual entry", "condition": "condition", "config_sync_schedule": "Configure scheduled synchronization", + "confirm_import_images": "Total {{num}} | Confirm create", "confirm_to_rebuild_embedding_tip": "Are you sure you want to switch the index for the Dataset?\nSwitching the index is a significant operation that requires re-indexing all data in your Dataset, which may take a long time. 
Please ensure your account has sufficient remaining points.\n\nAdditionally, you need to update the applications that use this Dataset to avoid conflicts with other indexed model Datasets.", + "core.dataset.Image collection": "Image dataset", "core.dataset.import.Adjust parameters": "Adjust parameters", "custom_data_process_params": "Custom", "custom_data_process_params_desc": "Customize data processing rules", @@ -90,6 +95,7 @@ "image_auto_parse": "Automatic image indexing", "image_auto_parse_tips": "Call VLM to automatically label the pictures in the document and generate additional search indexes", "image_training_queue": "Queue of image processing", + "images_creating": "Creating", "immediate_sync": "Immediate Synchronization", "import.Auto mode Estimated Price Tips": "The text understanding model needs to be called, which requires more points: {{price}} points/1K tokens", "import.Embedding Estimated Price Tips": "Only use the index model and consume a small amount of AI points: {{price}} points/1K tokens", @@ -104,6 +110,8 @@ "index_size": "Index size", "index_size_tips": "When vectorized, the system will automatically further segment the blocks according to this size.", "input_required_field_to_select_baseurl": "Please enter the required information first", + "insert_images": "Add images", + "insert_images_success": "Images added successfully. They will be displayed once training is complete.", "is_open_schedule": "Enable scheduled synchronization", "keep_image": "Keep the picture", "loading": "Loading...", @@ -135,6 +143,7 @@ "process.Image_Index": "Image index generation", "process.Is_Ready": "Ready", "process.Is_Ready_Count": "{{count}} Group is ready", + "process.Parse_Image": "Image parsing", "process.Parsing": "Parsing", "process.Vectorizing": "Index vectorization", "process.Waiting": "Queue", @@ -179,13 +188,19 @@ "training.Error": "{{count}} Group exception", "training.Normal": "Normal", "training_mode": "Chunk mode", + "training_queue_tip": "Training queue status", "training_ready": "{{count}} Group", + "uploading_progress": "Uploading: {{num}}%", "vector_model_max_tokens_tip": "Each chunk of data has a maximum length of 3000 tokens", + "vector_training_queue": "Vector training queue", "vllm_model": "Image understanding model", + "vlm_model_required_tooltip": "A Vision Language Model is required to create image collections", + "vlm_model_required_warning": "Image datasets require a Vision Language Model (VLM) to be configured. 
Please add a model that supports image understanding in the model configuration first.", + "waiting_for_training": "Waiting for training", "website_dataset": "Website Sync", "website_dataset_desc": "Build knowledge base by crawling web page data in batches", "website_info": "Website Information", - "yuque_dataset": "Yuque Dataset", - "yuque_dataset_config": "Yuque Dataset Config", - "yuque_dataset_desc": "Can build a dataset using Yuque documents by configuring permissions, without secondary storage" + "yuque_dataset": "Yuque Knowledge Base", + "yuque_dataset_config": "Configure Yuque Knowledge Base", + "yuque_dataset_desc": "Build a knowledge base from Yuque documents by configuring document permissions; documents are not stored twice" } diff --git a/packages/web/i18n/en/file.json b/packages/web/i18n/en/file.json index 2bd7a3061..f84b203dc 100644 --- a/packages/web/i18n/en/file.json +++ b/packages/web/i18n/en/file.json @@ -1,9 +1,32 @@ { + "Action": "Please select the image to upload", + "All images import failed": "All images failed to import", + "Dataset_ID_not_found": "The dataset ID does not exist", + "Failed_to_get_token": "Failed to obtain the token", + "Image_ID_copied": "Image ID copied", + "Image_Preview": "Image preview", + "Image_dataset_requires_VLM_model_to_be_configured": "Image datasets require an image understanding model (VLM). Please add a model that supports image understanding in the model configuration first.", + "Image_does_not_belong_to_current_team": "The image does not belong to the current team", + "Image_file_does_not_exist": "The image does not exist", + "Loading_image": "Loading image...", + "Loading_image failed": "Preview loading failed", + "Only_support_uploading_one_image": "Only one image can be uploaded", + "Please select the image to upload": "Please select the image to upload", + "Please select the image to upload select the image to upload": "", + "Please wait for all files to upload": "Please wait for all files to finish uploading", "bucket_chat": "Conversation Files", "bucket_file": "Dataset Documents", "click_to_view_raw_source": "Click to View Original Source", + "common.dataset_data_input_image_support_format": "Supports .jpg, .jpeg, .png, .gif, .webp formats", + "delete_image": "Delete image", "file_name": "Filename", "file_size": "Filesize", + "image": "Image", + "image_collection": "Image collection", + "image_description": "Image description", + "image_description_tip": "Please enter a description of the image", + "please_upload_image_first": "Please upload an image first", + "reached_max_file_count": "Maximum file count reached", "release_the_mouse_to_upload_the_file": "Release Mouse to Upload File", "select_and_drag_file_tip": "Click or Drag Files Here to Upload", "select_file_amount_limit": "You can select up to {{max}} files", @@ -12,7 +35,9 @@ "support_file_type": "Supports {{fileType}} file types", "support_max_count": "Supports up to {{maxCount}} files", "support_max_size": "Maximum file size is {{maxSize}}", + "total_files": "Total {{selectFiles.length}} files", + "upload_error_description": "Only multiple files or a single folder can be uploaded at a time", "upload_failed": "Upload Failed", - "reached_max_file_count": "Maximum file count reached", - "upload_error_description": "Only multiple files or a single folder can be uploaded at a time" -} \ No newline at end of file + "upload_file_error": "Please upload images", + "uploading": "Uploading..." 
+} diff --git a/packages/web/i18n/zh-CN/chat.json b/packages/web/i18n/zh-CN/chat.json index 89d37dcd6..87307af07 100644 --- a/packages/web/i18n/zh-CN/chat.json +++ b/packages/web/i18n/zh-CN/chat.json @@ -71,13 +71,13 @@ "response_embedding_model_tokens": "向量模型 Tokens", "response_hybrid_weight": "语义检索 : 全文检索 = {{emb}} : {{text}}", "response_rerank_tokens": "重排模型 Tokens", + "search_results": "搜索结果", "select": "选择", "select_file": "上传文件", "select_file_img": "上传文件/图片", "select_img": "上传图片", "source_cronJob": "定时执行", "stream_output": "流输出", - "to_dataset": "前往知识库", "unsupported_file_type": "不支持的文件类型", "upload": "上传", "variable_invisable_in_share": "自定义变量在免登录链接中不可见", diff --git a/packages/web/i18n/zh-CN/common.json b/packages/web/i18n/zh-CN/common.json index c583992d4..2746a6732 100644 --- a/packages/web/i18n/zh-CN/common.json +++ b/packages/web/i18n/zh-CN/common.json @@ -403,7 +403,6 @@ "core.chat.response.module model": "模型", "core.chat.response.module name": "模型名", "core.chat.response.module query": "问题/检索词", - "core.chat.response.module quoteList": "引用内容", "core.chat.response.module similarity": "相似度", "core.chat.response.module temperature": "温度", "core.chat.response.module time": "运行时长", @@ -434,7 +433,6 @@ "core.dataset.Text collection": "文本数据集", "core.dataset.apiFile": "API 文件", "core.dataset.collection.Click top config website": "点击配置网站", - "core.dataset.collection.Collection name": "数据集名称", "core.dataset.collection.Collection raw text": "数据集内容", "core.dataset.collection.Empty Tip": "数据集空空如也", "core.dataset.collection.QA Prompt": "QA 拆分引导词", @@ -451,7 +449,6 @@ "core.dataset.collection.metadata.metadata": "元数据", "core.dataset.collection.metadata.read source": "查看原始内容", "core.dataset.collection.metadata.source": "数据来源", - "core.dataset.collection.metadata.source name": "来源名", "core.dataset.collection.metadata.source size": "来源大小", "core.dataset.collection.status.active": "已就绪", "core.dataset.collection.status.error": "训练异常", @@ -743,7 +740,7 @@ "core.workflow.value": "值", "core.workflow.variable": "变量", "create": "去创建", - "create_failed": "创建异常", + "create_failed": "创建失败", "create_success": "创建成功", "create_time": "创建时间", "cron_job_run_app": "定时任务", @@ -788,7 +785,6 @@ "dataset.dataset_name": "知识库名称", "dataset.deleteFolderTips": "确认删除该文件夹及其包含的所有知识库?删除后数据无法恢复,请确认!", "dataset.test.noResult": "搜索结果为空", - "dataset_data_import_q_placeholder": "最多 {{maxToken}} 字。", "dataset_data_input_a": "答案", "dataset_data_input_chunk": "常规模式", "dataset_data_input_chunk_content": "内容", @@ -802,7 +798,6 @@ "delete_success": "删除成功", "delete_warning": "删除警告", "embedding_model_not_config": "检测到没有可用的索引模型", - "error.Create failed": "创建失败", "error.code_error": "验证码错误", "error.fileNotFound": "文件找不到了~", "error.inheritPermissionError": "权限继承错误", @@ -1208,6 +1203,7 @@ "templateTags.Writing": "文本创作", "template_market": "模板市场", "textarea_variable_picker_tip": "输入\"/\"可选择变量", + "to_dataset": "前往知识库", "ui.textarea.Magnifying": "放大", "un_used": "未使用", "unauth_token": "凭证已过期,请重新登录", diff --git a/packages/web/i18n/zh-CN/dataset.json b/packages/web/i18n/zh-CN/dataset.json index 455343d17..2ed907486 100644 --- a/packages/web/i18n/zh-CN/dataset.json +++ b/packages/web/i18n/zh-CN/dataset.json @@ -28,16 +28,21 @@ "collection.training_type": "处理模式", "collection_data_count": "数据量", "collection_metadata_custom_pdf_parse": "PDF增强解析", + "collection_name": "数据集名称", "collection_not_support_retraining": "该集合类型不支持重新调整参数", "collection_not_support_sync": "该集合不支持同步", "collection_sync": "立即同步", "collection_sync_confirm_tip": 
"确认开始同步数据?系统将会拉取最新数据进行比较,如果内容不相同,则会创建一个新的集合并删除旧的集合,请确认!", "collection_tags": "集合标签", + "common.dataset.data.Input Error Tip": "[图片数据集] 处理过程错误:", + "common.error.unKnow": "未知错误", "common_dataset": "通用知识库", "common_dataset_desc": "通过导入文件、网页链接或手动录入形式构建知识库", "condition": "条件", "config_sync_schedule": "配置定时同步", + "confirm_import_images": "共 {{num}} 张图片 | 确认创建", "confirm_to_rebuild_embedding_tip": "确认为知识库切换索引?\n切换索引是一个非常重量的操作,需要对您知识库内所有数据进行重新索引,时间可能较长,请确保账号内剩余积分充足。\n\n此外,你还需要注意修改选择该知识库的应用,避免它们与其他索引模型知识库混用。", + "core.dataset.Image collection": "图片数据集", "core.dataset.import.Adjust parameters": "调整参数", "custom_data_process_params": "自定义", "custom_data_process_params_desc": "自定义设置数据处理规则", @@ -90,6 +95,7 @@ "image_auto_parse": "图片自动索引", "image_auto_parse_tips": "调用 VLM 自动标注文档里的图片,并生成额外的检索索引", "image_training_queue": "图片处理排队", + "images_creating": "正在创建", "immediate_sync": "立即同步", "import.Auto mode Estimated Price Tips": "需调用文本理解模型,需要消耗较多AI 积分:{{price}} 积分/1K tokens", "import.Embedding Estimated Price Tips": "仅使用索引模型,消耗少量 AI 积分:{{price}} 积分/1K tokens", @@ -104,6 +110,8 @@ "index_size": "索引大小", "index_size_tips": "向量化时内容的长度,系统会自动按该大小对分块进行进一步的分割。", "input_required_field_to_select_baseurl": "请先输入必填信息", + "insert_images": "新增图片", + "insert_images_success": "新增图片成功,需等待训练完成才会展示", "is_open_schedule": "启用定时同步", "keep_image": "保留图片", "loading": "加载中...", @@ -135,6 +143,7 @@ "process.Image_Index": "图片索引生成", "process.Is_Ready": "已就绪", "process.Is_Ready_Count": "{{count}} 组已就绪", + "process.Parse_Image": "图片解析中", "process.Parsing": "内容解析中", "process.Vectorizing": "索引向量化", "process.Waiting": "排队中", @@ -176,11 +185,14 @@ "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "知识库有训练中或正在重建的索引", "total_num_files": "共 {{total}} 个文件", "training.Error": "{{count}} 组异常", + "training.Image mode": "图片处理", "training.Normal": "正常", "training_mode": "处理方式", "training_ready": "{{count}} 组", + "uploading_progress": "上传中: {{num}}%", "vector_model_max_tokens_tip": "每个分块数据,最大长度为 3000 tokens", "vllm_model": "图片理解模型", + "vlm_model_required_warning": "需要图片理解模型", "website_dataset": "Web 站点同步", "website_dataset_desc": "通过爬虫,批量爬取网页数据构建知识库", "website_info": "网站信息", diff --git a/packages/web/i18n/zh-CN/file.json b/packages/web/i18n/zh-CN/file.json index 839fe4c82..7bf03f213 100644 --- a/packages/web/i18n/zh-CN/file.json +++ b/packages/web/i18n/zh-CN/file.json @@ -1,9 +1,33 @@ { + "Action": "请选择要上传的图片", + "All images import failed": "所有图片导入失败", + "Dataset_ID_not_found": "数据集ID不存在", + "Failed_to_get_token": "获取令牌失败", + "Image_ID_copied": "已复制ID", + "Image_Preview": "图片预览", + "Image_dataset_requires_VLM_model_to_be_configured": "图片数据集需要配置图片理解模型(VLM)才能使用,请先在模型配置中添加支持图片理解的模型", + "Image_does_not_belong_to_current_team": "图片不属于当前团队", + "Image_file_does_not_exist": "图片不存在", + "Loading_image": "加载图片中...", + "Loading_image failed": "预览加载失败", + "Only_support_uploading_one_image": "仅支持上传一张图片", + "image_description_tip": "请输入图片的描述内容", + "Please select the image to upload": "请选择要上传的图片", + "Please wait for all files to upload": "请等待所有文件上传完成", "bucket_chat": "对话文件", "bucket_file": "知识库文件", "click_to_view_raw_source": "点击查看来源", + "common.Some images failed to process": "部分图片处理失败", + "common.dataset_data_input_image_support_format": "支持 .jpg, .jpeg, .png, .gif, .webp 格式", + "count.core.dataset.collection.Create Success": "成功导入 {{count}} 张图片", + "delete_image": "删除图片", "file_name": "文件名", "file_size": "文件大小", + "image": "图片", + "image_collection": "图片集合", + "image_description": "图片描述", + "please_upload_image_first": "请先上传图片", + 
"reached_max_file_count": "已达到最大文件数量", "release_the_mouse_to_upload_the_file": "松开鼠标上传文件", "select_and_drag_file_tip": "点击或拖动文件到此处上传", "select_file_amount_limit": "最多选择 {{max}} 个文件", @@ -12,7 +36,9 @@ "support_file_type": "支持 {{fileType}} 类型文件", "support_max_count": "最多支持 {{maxCount}} 个文件", "support_max_size": "单个文件最大 {{maxSize}}", + "total_files": "共{{selectFiles.length}}个文件", + "upload_error_description": "单次只支持上传多个文件或者一个文件夹", "upload_failed": "上传异常", - "reached_max_file_count": "已达到最大文件数量", - "upload_error_description": "单次只支持上传多个文件或者一个文件夹" -} \ No newline at end of file + "upload_file_error": "请上传图片", + "uploading": "正在上传..." +} diff --git a/packages/web/i18n/zh-Hant/chat.json b/packages/web/i18n/zh-Hant/chat.json index 53164268f..34ff1f7b3 100644 --- a/packages/web/i18n/zh-Hant/chat.json +++ b/packages/web/i18n/zh-Hant/chat.json @@ -71,13 +71,13 @@ "response_embedding_model_tokens": "向量模型 Tokens", "response_hybrid_weight": "語義檢索 : 全文檢索 = {{emb}} : {{text}}", "response_rerank_tokens": "重排模型 Tokens", + "search_results": "搜索結果", "select": "選取", "select_file": "上傳檔案", "select_file_img": "上傳檔案 / 圖片", "select_img": "上傳圖片", "source_cronJob": "定時執行", "stream_output": "串流輸出", - "to_dataset": "前往知識庫", "unsupported_file_type": "不支援的檔案類型", "upload": "上傳", "variable_invisable_in_share": "自定義變數在免登入連結中不可見", diff --git a/packages/web/i18n/zh-Hant/common.json b/packages/web/i18n/zh-Hant/common.json index 2fed428b3..a0c8cd7c1 100644 --- a/packages/web/i18n/zh-Hant/common.json +++ b/packages/web/i18n/zh-Hant/common.json @@ -403,7 +403,6 @@ "core.chat.response.module model": "模型", "core.chat.response.module name": "模型名稱", "core.chat.response.module query": "問題/搜尋詞", - "core.chat.response.module quoteList": "引用內容", "core.chat.response.module similarity": "相似度", "core.chat.response.module temperature": "溫度", "core.chat.response.module time": "執行時長", @@ -434,7 +433,6 @@ "core.dataset.Text collection": "文字資料集", "core.dataset.apiFile": "API 檔案", "core.dataset.collection.Click top config website": "點選設定網站", - "core.dataset.collection.Collection name": "資料集名稱", "core.dataset.collection.Collection raw text": "資料集內容", "core.dataset.collection.Empty Tip": "資料集是空的", "core.dataset.collection.QA Prompt": "問答拆分提示詞", @@ -451,7 +449,6 @@ "core.dataset.collection.metadata.metadata": "中繼資料", "core.dataset.collection.metadata.read source": "檢視原始內容", "core.dataset.collection.metadata.source": "資料來源", - "core.dataset.collection.metadata.source name": "來源名稱", "core.dataset.collection.metadata.source size": "來源大小", "core.dataset.collection.status.active": "已就緒", "core.dataset.collection.status.error": "訓練異常", @@ -555,7 +552,7 @@ "core.dataset.training.Agent queue": "問答訓練排隊中", "core.dataset.training.Auto mode": "補充索引", "core.dataset.training.Auto mode Tip": "透過子索引以及呼叫模型產生相關問題與摘要,來增加資料區塊的語意豐富度,更有利於檢索。需要消耗更多的儲存空間並增加 AI 呼叫次數。", - "core.dataset.training.Chunk mode": "分塊存儲", + "core.dataset.training.Chunk mode": "分塊儲存", "core.dataset.training.Full": "預計 20 分鐘以上", "core.dataset.training.Leisure": "閒置", "core.dataset.training.QA mode": "問答對提取", @@ -788,7 +785,6 @@ "dataset.dataset_name": "知識庫名稱", "dataset.deleteFolderTips": "確認刪除此資料夾及其包含的所有知識庫?刪除後資料無法復原,請確認!", "dataset.test.noResult": "搜尋結果為空", - "dataset_data_import_q_placeholder": "最多 {{maxToken}} 字。", "dataset_data_input_a": "答案", "dataset_data_input_chunk": "常規模式", "dataset_data_input_chunk_content": "內容", @@ -802,7 +798,6 @@ "delete_success": "刪除成功", "delete_warning": "刪除警告", "embedding_model_not_config": "偵測到沒有可用的索引模型", - "error.Create failed": "建立失敗", "error.code_error": 
"驗證碼錯誤", "error.fileNotFound": "找不到檔案", "error.inheritPermissionError": "繼承權限錯誤", @@ -1208,6 +1203,7 @@ "templateTags.Writing": "文字創作", "template_market": "模板市場", "textarea_variable_picker_tip": "輸入「/」以選擇變數", + "to_dataset": "前往知識庫", "ui.textarea.Magnifying": "放大", "un_used": "未使用", "unauth_token": "憑證已過期,請重新登入", diff --git a/packages/web/i18n/zh-Hant/dataset.json b/packages/web/i18n/zh-Hant/dataset.json index 463d81a6e..23ecd5ccd 100644 --- a/packages/web/i18n/zh-Hant/dataset.json +++ b/packages/web/i18n/zh-Hant/dataset.json @@ -26,16 +26,21 @@ "collection.training_type": "處理模式", "collection_data_count": "資料量", "collection_metadata_custom_pdf_parse": "PDF 增強解析", + "collection_name": "數據集名稱", "collection_not_support_retraining": "此集合類型不支援重新調整參數", "collection_not_support_sync": "該集合不支援同步", "collection_sync": "立即同步", "collection_sync_confirm_tip": "確認開始同步資料?\n系統將會拉取最新資料進行比較,如果內容不相同,則會建立一個新的集合並刪除舊的集合,請確認!", "collection_tags": "集合標籤", + "common.dataset.data.Input Error Tip": "[圖片數據集] 處理過程錯誤:", + "common.error.unKnow": "未知錯誤", "common_dataset": "通用資料集", "common_dataset_desc": "通過導入文件、網頁鏈接或手動錄入形式構建知識庫", "condition": "條件", "config_sync_schedule": "設定定時同步", + "confirm_import_images": "共 {{num}} 張圖片 | 確認創建", "confirm_to_rebuild_embedding_tip": "確定要為資料集切換索引嗎?\n切換索引是一個重要的操作,需要對您資料集內所有資料重新建立索引,可能需要較長時間,請確保帳號內剩餘點數充足。\n\n此外,您還需要注意修改使用此資料集的應用程式,避免與其他索引模型資料集混用。", + "core.dataset.Image collection": "圖片數據集", "core.dataset.import.Adjust parameters": "調整參數", "custom_data_process_params": "自訂", "custom_data_process_params_desc": "自訂資料處理規則", @@ -88,6 +93,7 @@ "image_auto_parse": "圖片自動索引", "image_auto_parse_tips": "呼叫 VLM 自動標註文件裡的圖片,並生成額外的檢索索引", "image_training_queue": "圖片處理排隊", + "images_creating": "正在創建", "immediate_sync": "立即同步", "import.Auto mode Estimated Price Tips": "需呼叫文字理解模型,將消耗較多 AI 點數:{{price}} 點數 / 1K tokens", "import.Embedding Estimated Price Tips": "僅使用索引模型,消耗少量 AI 點數:{{price}} 點數 / 1K tokens", @@ -102,6 +108,8 @@ "index_size": "索引大小", "index_size_tips": "向量化時內容的長度,系統會自動按該大小對分塊進行進一步的分割。", "input_required_field_to_select_baseurl": "請先輸入必填信息", + "insert_images": "新增圖片", + "insert_images_success": "新增圖片成功,需等待訓練完成才會展示", "is_open_schedule": "啟用定時同步", "keep_image": "保留圖片", "loading": "加載中...", @@ -133,6 +141,7 @@ "process.Image_Index": "圖片索引生成", "process.Is_Ready": "已就緒", "process.Is_Ready_Count": "{{count}} 組已就緒", + "process.Parse_Image": "圖片解析中", "process.Parsing": "內容解析中", "process.Vectorizing": "索引向量化", "process.Waiting": "排隊中", @@ -174,11 +183,13 @@ "the_knowledge_base_has_indexes_that_are_being_trained_or_being_rebuilt": "資料集有索引正在訓練或重建中", "total_num_files": "共 {{total}} 個文件", "training.Error": "{{count}} 組異常", + "training.Image mode": "圖片處理", "training.Normal": "正常", "training_mode": "分段模式", "training_ready": "{{count}} 組", "vector_model_max_tokens_tip": "每個分塊資料,最大長度為 3000 tokens", "vllm_model": "圖片理解模型", + "vlm_model_required_warning": "需要圖片理解模型", "website_dataset": "網站同步", "website_dataset_desc": "通過爬蟲,批量爬取網頁數據構建知識庫", "website_info": "網站資訊", diff --git a/packages/web/i18n/zh-Hant/file.json b/packages/web/i18n/zh-Hant/file.json index bb297ce9b..d7ea49d12 100644 --- a/packages/web/i18n/zh-Hant/file.json +++ b/packages/web/i18n/zh-Hant/file.json @@ -1,9 +1,31 @@ { + "Action": "請選擇要上傳的圖片", + "All images import failed": "所有圖片導入失敗", + "Dataset_ID_not_found": "數據集ID不存在", + "Failed_to_get_token": "獲取令牌失敗", + "Image_ID_copied": "已復制ID", + "Image_Preview": "圖片預覽", + "Image_dataset_requires_VLM_model_to_be_configured": "圖片數據集需要配置圖片理解模型(VLM)才能使用,請先在模型配置中添加支持圖片理解的模型", + 
"Image_does_not_belong_to_current_team": "圖片不屬於當前團隊", + "Image_file_does_not_exist": "圖片不存在", + "Loading_image": "加載圖片中...", + "Loading_image_failed": "預覽加載失敗", + "Only_support_uploading_one_image": "僅支持上傳一張圖片", + "image_description_tip": "請輸入圖片的描述內容", + "Please select the image to upload": "請選擇要上傳的圖片", + "Please select the image to upload select the image to upload": "", + "Please wait for all files to upload": "請等待所有文件上傳完成", "bucket_chat": "對話檔案", "bucket_file": "知識庫檔案", "click_to_view_raw_source": "點選檢視原始來源", + "dataset_data_input_image_support_format": "支持 .jpg, .jpeg, .png, .gif, .webp 格式", + "delete_image": "刪除圖片", "file_name": "檔案名稱", "file_size": "檔案大小", + "image": "圖片", + "image_collection": "圖片集合", + "please_upload_image_first": "請先上傳圖片", + "reached_max_file_count": "已達檔案數量上限", "release_the_mouse_to_upload_the_file": "放開滑鼠以上傳檔案", "select_and_drag_file_tip": "點選或拖曳檔案至此處上傳", "select_file_amount_limit": "最多可選擇 {{max}} 個檔案", @@ -12,7 +34,9 @@ "support_file_type": "支援 {{fileType}} 格式的檔案", "support_max_count": "最多可支援 {{maxCount}} 個檔案", "support_max_size": "單一檔案大小上限為 {{maxSize}}", + "total_files": "共{{selectFiles.length}}個文件", + "upload_error_description": "單次僅支援上傳多個檔案或一個資料夾", "upload_failed": "上傳失敗", - "reached_max_file_count": "已達檔案數量上限", - "upload_error_description": "單次僅支援上傳多個檔案或一個資料夾" -} \ No newline at end of file + "upload_file_error": "請上傳圖片", + "uploading": "正在上傳..." +} diff --git a/projects/app/src/components/MyImage/index.tsx b/projects/app/src/components/MyImage/index.tsx index da95109c0..9c9f58d47 100644 --- a/projects/app/src/components/MyImage/index.tsx +++ b/projects/app/src/components/MyImage/index.tsx @@ -3,38 +3,28 @@ import { Skeleton, type ImageProps } from '@chakra-ui/react'; import CustomImage from '@fastgpt/web/components/common/Image/MyImage'; export const MyImage = (props: ImageProps) => { - const [isLoading, setIsLoading] = useState(true); const [succeed, setSucceed] = useState(false); + return ( - - { - setIsLoading(false); - setSucceed(true); - }} - onError={() => setIsLoading(false)} - onClick={() => { - if (!succeed) return; - window.open(props.src, '_blank'); - }} - {...props} - /> - + { + setSucceed(true); + }} + onClick={() => { + if (!succeed) return; + window.open(props.src, '_blank'); + }} + {...props} + /> ); }; diff --git a/projects/app/src/components/common/NextHead/index.tsx b/projects/app/src/components/common/NextHead/index.tsx index d6210cff4..a5cbf11f9 100644 --- a/projects/app/src/components/common/NextHead/index.tsx +++ b/projects/app/src/components/common/NextHead/index.tsx @@ -18,7 +18,7 @@ const NextHead = ({ title, icon, desc }: { title?: string; icon?: string; desc?: name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,minimum-scale=1.0,user-scalable=no, viewport-fit=cover" /> - + {desc && } {icon && } diff --git a/projects/app/src/components/core/app/InputGuideConfig.tsx b/projects/app/src/components/core/app/InputGuideConfig.tsx index 0af949c53..f773d7225 100644 --- a/projects/app/src/components/core/app/InputGuideConfig.tsx +++ b/projects/app/src/components/core/app/InputGuideConfig.tsx @@ -240,7 +240,7 @@ const LexiconConfigModal = ({ appId, onClose }: { appId: string; onClose: () => onSuccess() { setNewData(undefined); }, - errorToast: t('common:error.Create failed') + errorToast: t('common:create_failed') } ); diff --git a/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/QuoteList.tsx b/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/QuoteList.tsx index 
cf1cf33c0..4026f8608 100644 --- a/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/QuoteList.tsx +++ b/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/QuoteList.tsx @@ -57,11 +57,12 @@ const QuoteList = React.memo(function QuoteList({ return { ...item, q: currentFilterItem?.q || '', - a: currentFilterItem?.a || '' + a: currentFilterItem?.a || '', + imagePreivewUrl: currentFilterItem?.imagePreivewUrl }; } - return { ...item, q: item.q || '', a: item.a || '' }; + return { ...item, q: item.q || '' }; }); return processedData.sort((a, b) => { @@ -87,6 +88,7 @@ const QuoteList = React.memo(function QuoteList({ diff --git a/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/ResponseTags.tsx b/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/ResponseTags.tsx index 65f3580f8..4202b76fb 100644 --- a/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/ResponseTags.tsx +++ b/projects/app/src/components/core/chat/ChatContainer/ChatBox/components/ResponseTags.tsx @@ -81,7 +81,9 @@ const ResponseTags = ({ .map((item) => ({ sourceName: item.sourceName, sourceId: item.sourceId, - icon: getSourceNameIcon({ sourceId: item.sourceId, sourceName: item.sourceName }), + icon: item.imageId + ? 'core/dataset/imageFill' + : getSourceNameIcon({ sourceId: item.sourceId, sourceName: item.sourceName }), collectionId: item.collectionId, datasetId: item.datasetId })); diff --git a/projects/app/src/components/core/chat/components/WholeResponseModal.tsx b/projects/app/src/components/core/chat/components/WholeResponseModal.tsx index 2da6dbc8a..bcb91de9a 100644 --- a/projects/app/src/components/core/chat/components/WholeResponseModal.tsx +++ b/projects/app/src/components/core/chat/components/WholeResponseModal.tsx @@ -300,7 +300,7 @@ export const WholeResponseContent = ({ {activeModule.quoteList && activeModule.quoteList.length > 0 && ( } /> )} diff --git a/projects/app/src/components/core/dataset/QuoteItem.tsx b/projects/app/src/components/core/dataset/QuoteItem.tsx index 2fcb9803b..bbf79e9d2 100644 --- a/projects/app/src/components/core/dataset/QuoteItem.tsx +++ b/projects/app/src/components/core/dataset/QuoteItem.tsx @@ -8,7 +8,11 @@ import { useTranslation } from 'next-i18next'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import dynamic from 'next/dynamic'; import MyBox from '@fastgpt/web/components/common/MyBox'; -import { SearchScoreTypeEnum, SearchScoreTypeMap } from '@fastgpt/global/core/dataset/constants'; +import { + DatasetCollectionTypeEnum, + SearchScoreTypeEnum, + SearchScoreTypeMap +} from '@fastgpt/global/core/dataset/constants'; import type { readCollectionSourceBody } from '@/pages/api/core/dataset/collection/read'; import Markdown from '@/components/Markdown'; @@ -88,11 +92,13 @@ export const formatScore = (score: ScoreItemType[]) => { const QuoteItem = ({ quoteItem, canViewSource, + canEditData, canEditDataset, ...RawSourceBoxProps }: { quoteItem: SearchDataResponseItemType; canViewSource?: boolean; + canEditData?: boolean; canEditDataset?: boolean; } & Omit) => { const { t } = useTranslation(); @@ -206,7 +212,7 @@ const QuoteItem = ({ {...RawSourceBoxProps} /> - {quoteItem.id && canEditDataset && ( + {quoteItem.id && canEditData && ( - {t('chat:to_dataset')} + {t('common:to_dataset')} )} diff --git a/projects/app/src/components/core/dataset/RawSourceBox.tsx b/projects/app/src/components/core/dataset/RawSourceBox.tsx index 0db958a33..2455d7093 100644 --- 
a/projects/app/src/components/core/dataset/RawSourceBox.tsx +++ b/projects/app/src/components/core/dataset/RawSourceBox.tsx @@ -3,20 +3,22 @@ import { Box, type BoxProps } from '@chakra-ui/react'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import { useTranslation } from 'next-i18next'; import { getCollectionSourceAndOpen } from '@/web/core/dataset/hooks/readCollectionSource'; -import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils'; +import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils'; import MyIcon from '@fastgpt/web/components/common/Icon'; import type { readCollectionSourceBody } from '@/pages/api/core/dataset/collection/read'; +import type { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; type Props = BoxProps & readCollectionSourceBody & { + collectionType?: DatasetCollectionTypeEnum; sourceName?: string; - collectionId: string; sourceId?: string; canView?: boolean; }; const RawSourceBox = ({ sourceId, + collectionType, sourceName = '', canView = true, @@ -35,7 +37,10 @@ const RawSourceBox = ({ const canPreview = !!sourceId && canView; - const icon = useMemo(() => getSourceNameIcon({ sourceId, sourceName }), [sourceId, sourceName]); + const icon = useMemo( + () => getCollectionIcon({ type: collectionType, sourceId, name: sourceName }), + [collectionType, sourceId, sourceName] + ); const read = getCollectionSourceAndOpen({ collectionId, appId, diff --git a/projects/app/src/global/core/dataset/type.d.ts b/projects/app/src/global/core/dataset/type.d.ts index 41cd315e8..ddca53cb0 100644 --- a/projects/app/src/global/core/dataset/type.d.ts +++ b/projects/app/src/global/core/dataset/type.d.ts @@ -34,9 +34,11 @@ export type DatasetDataListItemType = { _id: string; datasetId: string; collectionId: string; - q: string; // embedding content - a: string; // bonus content + q?: string; + a?: string; + imageId?: string; + imageSize?: number; + imagePreviewUrl?: string; //image preview url chunkIndex?: number; updated?: boolean; - // indexes: DatasetDataSchemaType['indexes']; }; diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/BackupImportModal.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/BackupImportModal.tsx index 267ff3b2c..2d10aca82 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/BackupImportModal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/BackupImportModal.tsx @@ -50,7 +50,7 @@ const BackupImportModal = ({ maxCount={1} fileType="csv" selectFiles={selectFiles} - setSelectFiles={setSelectFiles} + setSelectFiles={(e) => setSelectFiles(e)} /> {/* File render */} {selectFiles.length > 0 && ( diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx index 6f5c857e0..19e73bf41 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/Header.tsx @@ -248,6 +248,26 @@ const Header = ({ hasTrainingData }: { hasTrainingData: boolean }) => { }); } }, + ...(feConfigs?.isPlus + ? 
[ + { + label: ( + + + {t('dataset:core.dataset.Image collection')} + + ), + onClick: () => + router.replace({ + query: { + ...router.query, + currentTab: TabEnum.import, + source: ImportDataSourceEnum.imageDataset + } + }) + } + ] + : []), { label: ( @@ -473,7 +493,10 @@ const Header = ({ hasTrainingData }: { hasTrainingData: boolean }) => { name={editFolderData.name} /> )} - + {isOpenFileSourceSelector && } {isOpenBackupImportModal && ( collectionsList.map((item) => { const collection = item.data; - const icon = getCollectionIcon(collection.type, collection.name); + const icon = getCollectionIcon({ type: collection.type, name: collection.name }); return { id: collection._id, tags: collection.tags, diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx index 03f8eeae0..ae6792609 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/TrainingStates.tsx @@ -35,6 +35,8 @@ import { useForm } from 'react-hook-form'; import type { getTrainingDetailResponse } from '@/pages/api/core/dataset/collection/trainingDetail'; import { useScrollPagination } from '@fastgpt/web/hooks/useScrollPagination'; import EmptyTip from '@fastgpt/web/components/common/EmptyTip'; +import MyImage from '@/components/MyImage'; +import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; enum TrainingStatus { NotStart = 'NotStart', @@ -48,6 +50,8 @@ const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailRes const { t } = useTranslation(); const isQA = trainingDetail?.trainingType === DatasetCollectionDataProcessModeEnum.qa; + const isImageParse = + trainingDetail?.trainingType === DatasetCollectionDataProcessModeEnum.imageParse; /* 状态计算 @@ -102,6 +106,18 @@ const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailRes status: TrainingStatus.Ready, errorCount: 0 }, + ...(isImageParse + ? [ + { + errorCount: trainingDetail.errorCounts.imageParse, + label: t(TrainingProcess.parseImage.label), + statusText: getStatusText(TrainingModeEnum.imageParse), + status: getTrainingStatus({ + errorCount: trainingDetail.errorCounts.imageParse + }) + } + ] + : []), ...(isQA ? [ { @@ -114,7 +130,7 @@ const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailRes } ] : []), - ...(trainingDetail?.advancedTraining.imageIndex && !isQA + ...(trainingDetail?.advancedTraining.imageIndex ? [ { errorCount: trainingDetail.errorCounts.image, @@ -126,7 +142,7 @@ const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailRes } ] : []), - ...(trainingDetail?.advancedTraining.autoIndexes && !isQA + ...(trainingDetail?.advancedTraining.autoIndexes ? 
[ { errorCount: trainingDetail.errorCounts.auto, @@ -159,7 +175,17 @@ const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailRes ]; return states; - }, [trainingDetail, t, isQA]); + }, [ + trainingDetail.queuedCounts, + trainingDetail.trainingCounts, + trainingDetail.errorCounts, + trainingDetail?.advancedTraining.imageIndex, + trainingDetail?.advancedTraining.autoIndexes, + trainingDetail.trainedCount, + t, + isImageParse, + isQA + ]); return ( @@ -254,11 +280,20 @@ const ProgressView = ({ trainingDetail }: { trainingDetail: getTrainingDetailRes ); }; -const ErrorView = ({ datasetId, collectionId }: { datasetId: string; collectionId: string }) => { +const ErrorView = ({ + datasetId, + collectionId, + refreshTrainingDetail +}: { + datasetId: string; + collectionId: string; + refreshTrainingDetail: () => void; +}) => { const { t } = useTranslation(); const TrainingText = { [TrainingModeEnum.chunk]: t('dataset:process.Vectorizing'), [TrainingModeEnum.qa]: t('dataset:process.Get QA'), + [TrainingModeEnum.imageParse]: t('dataset:process.Image_Index'), [TrainingModeEnum.image]: t('dataset:process.Image_Index'), [TrainingModeEnum.auto]: t('dataset:process.Auto_Index') }; @@ -308,6 +343,7 @@ const ErrorView = ({ datasetId, collectionId }: { datasetId: string; collectionI manual: true, onSuccess: () => { refreshList(); + refreshTrainingDetail(); setEditChunk(undefined); } } @@ -316,6 +352,7 @@ const ErrorView = ({ datasetId, collectionId }: { datasetId: string; collectionI if (editChunk) { return ( setEditChunk(undefined)} onSave={(data) => { @@ -401,10 +438,12 @@ const ErrorView = ({ datasetId, collectionId }: { datasetId: string; collectionI }; const EditView = ({ + loading, editChunk, onCancel, onSave }: { + loading: boolean; editChunk: getTrainingDataDetailResponse; onCancel: () => void; onSave: (data: { q: string; a?: string }) => void; @@ -419,20 +458,41 @@ const EditView = ({ return ( - {editChunk?.a && q} - + {editChunk?.imagePreviewUrl && ( + + {t('file:image')} + + + + + )} + + + {(editChunk?.a || editChunk?.imagePreviewUrl) && ( + + {editChunk?.a + ? 
t('common:dataset_data_input_chunk_content') + : t('common:dataset_data_input_q')} + + )} + + + {editChunk?.a && ( - <> - a + + {t('common:dataset_data_input_a')} - + )} - @@ -453,14 +513,15 @@ const TrainingStates = ({ const { t } = useTranslation(); const [tab, setTab] = useState(defaultTab); - const { data: trainingDetail, loading } = useRequest2( - () => getDatasetCollectionTrainingDetail(collectionId), - { - pollingInterval: 5000, - pollingWhenHidden: false, - manual: false - } - ); + const { + data: trainingDetail, + loading, + runAsync: refreshTrainingDetail + } = useRequest2(() => getDatasetCollectionTrainingDetail(collectionId), { + pollingInterval: 5000, + pollingWhenHidden: false, + manual: false + }); const errorCounts = (Object.values(trainingDetail?.errorCounts || {}) as number[]).reduce( (acc, count) => acc + count, @@ -493,7 +554,13 @@ const TrainingStates = ({ ]} /> {tab === 'states' && trainingDetail && } - {tab === 'errors' && } + {tab === 'errors' && ( + + )} ); diff --git a/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx b/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx index 33a7de218..5aeeec4eb 100644 --- a/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx +++ b/projects/app/src/pageComponents/dataset/detail/CollectionCard/index.tsx @@ -75,7 +75,7 @@ const CollectionCard = () => { const formatCollections = useMemo( () => collections.map((collection) => { - const icon = getCollectionIcon(collection.type, collection.name); + const icon = getCollectionIcon({ type: collection.type, name: collection.name }); const status = (() => { if (collection.hasError) { return { diff --git a/projects/app/src/pageComponents/dataset/detail/DataCard.tsx b/projects/app/src/pageComponents/dataset/detail/DataCard.tsx index 9e54d1493..28e94c8c5 100644 --- a/projects/app/src/pageComponents/dataset/detail/DataCard.tsx +++ b/projects/app/src/pageComponents/dataset/detail/DataCard.tsx @@ -1,5 +1,5 @@ import React, { useState, useMemo } from 'react'; -import { Box, Card, IconButton, Flex, Button, useTheme } from '@chakra-ui/react'; +import { Box, Card, IconButton, Flex, Button, useTheme, Image } from '@chakra-ui/react'; import { getDatasetDataList, delOneDatasetDataById, @@ -24,28 +24,36 @@ import TagsPopOver from './CollectionCard/TagsPopOver'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import MyDivider from '@fastgpt/web/components/common/MyDivider'; import Markdown from '@/components/Markdown'; -import { useMemoizedFn } from 'ahooks'; +import { useBoolean, useMemoizedFn } from 'ahooks'; import { useScrollPagination } from '@fastgpt/web/hooks/useScrollPagination'; import { TabEnum } from './NavBar'; import { - DatasetCollectionDataProcessModeEnum, + DatasetCollectionTypeEnum, ImportDataSourceEnum } from '@fastgpt/global/core/dataset/constants'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; import TrainingStates from './CollectionCard/TrainingStates'; import { getTextValidLength } from '@fastgpt/global/common/string/utils'; import PopoverConfirm from '@fastgpt/web/components/common/MyPopover/PopoverConfirm'; +import { formatFileSize } from '@fastgpt/global/common/file/tools'; +import MyImage from '@fastgpt/web/components/common/Image/MyImage'; +import dynamic from 'next/dynamic'; + +const InsertImagesModal = dynamic(() => import('./data/InsertImageModal'), { + ssr: false +}); const DataCard = () => { - const theme = useTheme(); const router = useRouter(); const { isPc } = useSystem(); 
- const { collectionId = '', datasetId } = router.query as { + const { feConfigs } = useSystemStore(); + + const { collectionId = '' } = router.query as { collectionId: string; datasetId: string; }; const datasetDetail = useContextSelector(DatasetPageContext, (v) => v.datasetDetail); - const { feConfigs } = useSystemStore(); + const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); const { t } = useTranslation(); const [searchText, setSearchText] = useState(''); @@ -78,21 +86,30 @@ const DataCard = () => { const [editDataId, setEditDataId] = useState(); - // get file info - const { data: collection } = useRequest2(() => getDatasetCollectionById(collectionId), { - refreshDeps: [collectionId], - manual: false, - onError: () => { - router.replace({ - query: { - datasetId - } - }); + // Get collection info + const { data: collection, runAsync: reloadCollection } = useRequest2( + () => getDatasetCollectionById(collectionId), + { + refreshDeps: [collectionId], + manual: false, + onError: () => { + router.replace({ + query: { + datasetId + } + }); + } } - }); + ); const canWrite = useMemo(() => datasetDetail.permission.hasWritePer, [datasetDetail]); + const [ + isInsertImagesModalOpen, + { setTrue: openInsertImagesModal, setFalse: closeInsertImagesModal } + ] = useBoolean(); + const isImageCollection = collection?.type === DatasetCollectionTypeEnum.images; + const onDeleteOneData = useMemoizedFn(async (dataId: string) => { try { await delOneDatasetDataById(dataId); @@ -125,6 +142,7 @@ const DataCard = () => { > {collection?._id && ( { {t('dataset:retain_collection')} )} - {canWrite && ( + {canWrite && !isImageCollection && ( )} + {canWrite && isImageCollection && ( + + )} @@ -236,7 +265,7 @@ const DataCard = () => { userSelect={'none'} boxShadow={'none'} bg={index % 2 === 1 ? 'myGray.50' : 'blue.50'} - border={theme.borders.sm} + border={'sm'} position={'relative'} overflow={'hidden'} _hover={{ @@ -282,17 +311,35 @@ const DataCard = () => { {/* Data content */} - - - {!!item.a && ( - <> - - - - )} - + {item.imagePreviewUrl ? ( + + + + + + + + + ) : ( + + + {!!item.a && ( + <> + + + + )} + + )} - {/* Mask */} + {/* Footer */} { py={1} mr={2} > - - {getTextValidLength(item.q + item.a || '')} + {item.imageSize ? 
( + <>{formatFileSize(item.imageSize)} + ) : ( + <> + + {getTextValidLength((item?.q || '') + (item?.a || ''))} + + )} {canWrite && ( { collectionId={collection._id} dataId={editDataId} onClose={() => setEditDataId(undefined)} - onSuccess={(data) => { + onSuccess={(data: any) => { if (editDataId === '') { refreshList(); return; @@ -386,9 +439,16 @@ const DataCard = () => { datasetId={datasetId} defaultTab={'errors'} collectionId={errorModalId} - onClose={() => setErrorModalId('')} + onClose={() => { + setErrorModalId(''); + refreshList(); + reloadCollection(); + }} /> )} + {isInsertImagesModalOpen && ( + + )} ); }; diff --git a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx index 13d7f3b96..a55ccf4c7 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/Context.tsx @@ -173,6 +173,20 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode { title: t('dataset:import_confirm') } + ], + [ImportDataSourceEnum.imageDataset]: [ + { + title: t('dataset:import_select_file') + }, + { + title: t('dataset:import_param_setting') + }, + { + title: t('dataset:import_data_preview') + }, + { + title: t('dataset:import_confirm') + } ] }; const steps = modeSteps[source]; @@ -238,20 +252,22 @@ const DatasetImportContextProvider = ({ children }: { children: React.ReactNode {/* step */} - - - + {source !== ImportDataSourceEnum.imageDataset && ( + + + + - + )} {children} ); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/components/FileSelector.tsx b/projects/app/src/pageComponents/dataset/detail/Import/components/FileSelector.tsx index a4b640f33..884a1036b 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/components/FileSelector.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/components/FileSelector.tsx @@ -7,15 +7,8 @@ import MyIcon from '@fastgpt/web/components/common/Icon'; import { useTranslation } from 'next-i18next'; import React, { type DragEvent, useCallback, useMemo, useState } from 'react'; import { getNanoid } from '@fastgpt/global/common/string/tools'; -import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; -import { getFileIcon } from '@fastgpt/global/common/file/icon'; import { useSystemStore } from '@/web/common/system/useSystemStore'; -import { uploadFile2DB } from '@/web/common/file/controller'; -import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; import type { ImportSourceItemType } from '@/web/core/dataset/type'; -import { useContextSelector } from 'use-context-selector'; -import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; -import { getErrText } from '@fastgpt/global/common/error/utils'; export type SelectFileItemType = { fileId: string; @@ -26,23 +19,18 @@ export type SelectFileItemType = { const FileSelector = ({ fileType, selectFiles, - setSelectFiles, - onStartSelect, - onFinishSelect, + onSelectFiles, ...props }: { fileType: string; selectFiles: ImportSourceItemType[]; - setSelectFiles: React.Dispatch>; - onStartSelect: () => void; - onFinishSelect: () => void; + onSelectFiles: (e: SelectFileItemType[]) => any; } & FlexProps) => { const { t } = useTranslation(); const { toast } = useToast(); const { feConfigs } = useSystemStore(); - const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); const maxCount = feConfigs?.uploadFileMaxAmount || 1000; const maxSize = 
(feConfigs?.uploadFileMaxSize || 1024) * 1024 * 1024; @@ -65,90 +53,6 @@ const FileSelector = ({ 'i' ); - const { runAsync: onSelectFile, loading: isLoading } = useRequest2( - async (files: SelectFileItemType[]) => { - { - await Promise.all( - files.map(async ({ fileId, file }) => { - try { - const { fileId: uploadFileId } = await uploadFile2DB({ - file, - bucketName: BucketNameEnum.dataset, - data: { - datasetId - }, - percentListen: (e) => { - setSelectFiles((state) => - state.map((item) => - item.id === fileId - ? { - ...item, - uploadedFileRate: item.uploadedFileRate - ? Math.max(e, item.uploadedFileRate) - : e - } - : item - ) - ); - } - }); - setSelectFiles((state) => - state.map((item) => - item.id === fileId - ? { - ...item, - dbFileId: uploadFileId, - isUploading: false, - uploadedFileRate: 100 - } - : item - ) - ); - } catch (error) { - setSelectFiles((state) => - state.map((item) => - item.id === fileId - ? { - ...item, - isUploading: false, - errorMsg: getErrText(error) - } - : item - ) - ); - } - }) - ); - } - }, - { - onBefore([files]) { - onStartSelect(); - setSelectFiles((state) => { - const formatFiles = files.map((selectFile) => { - const { fileId, file } = selectFile; - - return { - id: fileId, - createStatus: 'waiting', - file, - sourceName: file.name, - sourceSize: formatFileSize(file.size), - icon: getFileIcon(file.name), - isUploading: true, - uploadedFileRate: 0 - }; - }); - const results = formatFiles.concat(state).slice(0, maxCount); - return results; - }); - }, - onFinally() { - onFinishSelect(); - } - } - ); - const selectFileCallback = useCallback( (files: SelectFileItemType[]) => { if (selectFiles.length + files.length > maxCount) { @@ -160,7 +64,7 @@ const FileSelector = ({ } // size check if (!maxSize) { - return onSelectFile(files); + return onSelectFiles(files); } const filterFiles = files.filter((item) => item.file.size <= maxSize); @@ -171,9 +75,9 @@ const FileSelector = ({ }); } - return onSelectFile(filterFiles); + return onSelectFiles(filterFiles); }, - [t, maxCount, maxSize, onSelectFile, selectFiles.length, toast] + [t, maxCount, maxSize, onSelectFiles, selectFiles.length, toast] ); const handleDragEnter = (e: DragEvent) => { @@ -278,7 +182,6 @@ const FileSelector = ({ return ( { - {t('common:core.dataset.collection.Collection name')} + {t('dataset:collection_name')} { {...register('name', { required: true })} - placeholder={t('common:core.dataset.collection.Collection name')} + placeholder={t('dataset:collection_name')} bg={'myGray.50'} /> diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx index 3428a119a..18c60f0c0 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileLocal.tsx @@ -1,14 +1,20 @@ import React, { useCallback, useEffect, useMemo, useState } from 'react'; import { type ImportSourceItemType } from '@/web/core/dataset/type.d'; import { Box, Button } from '@chakra-ui/react'; -import FileSelector from '../components/FileSelector'; +import FileSelector, { type SelectFileItemType } from '../components/FileSelector'; import { useTranslation } from 'next-i18next'; import dynamic from 'next/dynamic'; -import Loading from '@fastgpt/web/components/common/MyLoading'; import { RenderUploadFiles } from '../components/RenderFiles'; import { useContextSelector } from 'use-context-selector'; import { DatasetImportContext 
} from '../Context'; +import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; +import { uploadFile2DB } from '@/web/common/file/controller'; +import { BucketNameEnum } from '@fastgpt/global/common/file/constants'; +import { getErrText } from '@fastgpt/global/common/error/utils'; +import { formatFileSize } from '@fastgpt/global/common/file/tools'; +import { getFileIcon } from '@fastgpt/global/common/file/icon'; +import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; const DataProcess = dynamic(() => import('../commonProgress/DataProcess')); const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); @@ -33,14 +39,16 @@ export default React.memo(FileLocal); const SelectFile = React.memo(function SelectFile() { const { t } = useTranslation(); + const { goToNext, sources, setSources } = useContextSelector(DatasetImportContext, (v) => v); + const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); + const [selectFiles, setSelectFiles] = useState( sources.map((source) => ({ isUploading: false, ...source })) ); - const [uploading, setUploading] = useState(false); const successFiles = useMemo(() => selectFiles.filter((item) => !item.errorMsg), [selectFiles]); useEffect(() => { @@ -53,15 +61,90 @@ const SelectFile = React.memo(function SelectFile() { goToNext(); }, [goToNext]); + const { runAsync: onSelectFiles, loading: uploading } = useRequest2( + async (files: SelectFileItemType[]) => { + { + await Promise.all( + files.map(async ({ fileId, file }) => { + try { + const { fileId: uploadFileId } = await uploadFile2DB({ + file, + bucketName: BucketNameEnum.dataset, + data: { + datasetId + }, + percentListen: (e) => { + setSelectFiles((state) => + state.map((item) => + item.id === fileId + ? { + ...item, + uploadedFileRate: item.uploadedFileRate + ? Math.max(e, item.uploadedFileRate) + : e + } + : item + ) + ); + } + }); + setSelectFiles((state) => + state.map((item) => + item.id === fileId + ? { + ...item, + dbFileId: uploadFileId, + isUploading: false, + uploadedFileRate: 100 + } + : item + ) + ); + } catch (error) { + setSelectFiles((state) => + state.map((item) => + item.id === fileId + ? 
{ + ...item, + isUploading: false, + errorMsg: getErrText(error) + } + : item + ) + ); + } + }) + ); + } + }, + { + onBefore([files]) { + setSelectFiles((state) => { + return [ + ...state, + ...files.map((selectFile) => { + const { fileId, file } = selectFile; + + return { + id: fileId, + createStatus: 'waiting', + file, + sourceName: file.name, + sourceSize: formatFileSize(file.size), + icon: getFileIcon(file.name), + isUploading: true, + uploadedFileRate: 0 + }; + }) + ]; + }); + } + } + ); + return ( - setUploading(true)} - onFinishSelect={() => setUploading(false)} - /> + {/* render files */} diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ImageDataset.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ImageDataset.tsx new file mode 100644 index 000000000..20d1f82fd --- /dev/null +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ImageDataset.tsx @@ -0,0 +1,189 @@ +import React, { useState } from 'react'; +import { Box, Button, Flex, Input, Image } from '@chakra-ui/react'; +import { useTranslation } from 'next-i18next'; +import { useRouter } from 'next/router'; +import { TabEnum } from '../../NavBar'; +import { createImageDatasetCollection } from '@/web/core/dataset/image/api'; +import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; +import { useForm } from 'react-hook-form'; +import FileSelector, { type SelectFileItemType } from '../components/FileSelector'; +import type { ImportSourceItemType } from '@/web/core/dataset/type'; +import { getNanoid } from '@fastgpt/global/common/string/tools'; +import MyIcon from '@fastgpt/web/components/common/Icon'; +import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; +import { useContextSelector } from 'use-context-selector'; +import { DatasetPageContext } from '@/web/core/dataset/context/datasetPageContext'; +import { DatasetImportContext } from '../Context'; +import MyImage from '@fastgpt/web/components/common/Image/MyImage'; + +const fileType = '.jpg, .jpeg, .png'; + +const ImageDataset = () => { + return ; +}; + +export default React.memo(ImageDataset); + +const SelectFile = React.memo(function SelectFile() { + const { t } = useTranslation(); + const router = useRouter(); + + const parentId = useContextSelector(DatasetImportContext, (v) => v.parentId); + const datasetId = useContextSelector(DatasetPageContext, (v) => v.datasetId); + + const [selectFiles, setSelectFiles] = useState([]); + const [uploadProgress, setUploadProgress] = useState(0); + + const { register, handleSubmit } = useForm({ + defaultValues: { + name: '' + } + }); + + const onSelectFiles = (files: SelectFileItemType[]) => { + setSelectFiles((pre) => { + const formatFiles = Array.from(files).map((item) => { + const previewUrl = URL.createObjectURL(item.file); + + return { + id: getNanoid(), + createStatus: 'waiting', + file: item.file, + sourceName: item.file.name, + icon: previewUrl + }; + }); + + return [...pre, ...formatFiles]; + }); + }; + const onRemoveFile = (index: number) => { + setSelectFiles((prev) => { + return prev.filter((_, i) => i !== index); + }); + }; + + const { runAsync: onCreate, loading: creating } = useRequest2( + async ({ name: collectionName }: { name: string }) => { + return await createImageDatasetCollection({ + parentId, + datasetId, + collectionName, + files: selectFiles.map((item) => item.file!).filter(Boolean), + onUploadProgress: setUploadProgress + }); + }, + { + manual: true, + successToast: t('common:create_success'), + onSuccess() { + router.replace({ 
+ query: { + datasetId: router.query.datasetId, + currentTab: TabEnum.collectionCard + } + }); + } + } + ); + + return ( + + + + {t('dataset:collection_name')} + + + + + + + + {t('common:core.dataset.collection.Collection raw text')} + + + + + + + {selectFiles.length > 0 && ( + + {selectFiles.map((file, index) => ( + + + onRemoveFile(index)} + className="close-icon" + display={['', 'none']} + zIndex={10} + /> + + ))} + + )} + + + + + + + + ); +}); diff --git a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx index 08771300c..1d3a69682 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/diffSource/ReTraining.tsx @@ -37,7 +37,7 @@ const ReTraining = () => { apiFileId: collection.apiFileId, createStatus: 'waiting', - icon: getCollectionIcon(collection.type, collection.name), + icon: getCollectionIcon({ type: collection.type, name: collection.name }), id: collection._id, isUploading: false, sourceName: collection.name, diff --git a/projects/app/src/pageComponents/dataset/detail/Import/index.tsx b/projects/app/src/pageComponents/dataset/detail/Import/index.tsx index f9d253ff3..345a360e8 100644 --- a/projects/app/src/pageComponents/dataset/detail/Import/index.tsx +++ b/projects/app/src/pageComponents/dataset/detail/Import/index.tsx @@ -11,6 +11,7 @@ const FileCustomText = dynamic(() => import('./diffSource/FileCustomText')); const ExternalFileCollection = dynamic(() => import('./diffSource/ExternalFile')); const APIDatasetCollection = dynamic(() => import('./diffSource/APIDataset')); const ReTraining = dynamic(() => import('./diffSource/ReTraining')); +const ImageDataset = dynamic(() => import('./diffSource/ImageDataset')); const ImportDataset = () => { const importSource = useContextSelector(DatasetImportContext, (v) => v.importSource); @@ -22,6 +23,8 @@ const ImportDataset = () => { if (importSource === ImportDataSourceEnum.fileCustom) return FileCustomText; if (importSource === ImportDataSourceEnum.externalFile) return ExternalFileCollection; if (importSource === ImportDataSourceEnum.apiDataset) return APIDatasetCollection; + if (importSource === ImportDataSourceEnum.imageDataset) return ImageDataset; + return null; }, [importSource]); return ImportComponent ? 
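
// Illustrative sketch (not part of the patch): Import/index.tsx above resolves the import
// component with an if-chain over ImportDataSourceEnum, now including imageDataset.
// An equivalent lookup-table sketch of that design choice; the enum values mirror the
// ones added in constants.ts, the component type is simplified to a plain function.
enum ImportDataSourceEnumSketch {
  fileLocal = 'fileLocal',
  fileCustom = 'fileCustom',
  externalFile = 'externalFile',
  apiDataset = 'apiDataset',
  reTraining = 'reTraining',
  imageDataset = 'imageDataset'
}

type ImportComponentSketch = () => string; // stands in for a lazily loaded React component

const importComponentMap: Partial<Record<ImportDataSourceEnumSketch, ImportComponentSketch>> = {
  [ImportDataSourceEnumSketch.imageDataset]: () => 'ImageDataset',
  [ImportDataSourceEnumSketch.reTraining]: () => 'ReTraining'
  // ...remaining sources omitted
};

function resolveImportComponent(source: ImportDataSourceEnumSketch): ImportComponentSketch | null {
  // Unknown sources fall through to null, matching the `return null` branch above.
  return importComponentMap[source] ?? null;
}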
( diff --git a/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx b/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx index b6bf568f4..567a06d67 100644 --- a/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx +++ b/projects/app/src/pageComponents/dataset/detail/InputDataModal.tsx @@ -1,37 +1,39 @@ import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'; -import { Box, Flex, Button, Textarea, ModalFooter, HStack, VStack } from '@chakra-ui/react'; -import { type UseFormRegister, useFieldArray, useForm } from 'react-hook-form'; +import { Box, Flex, Button, Textarea, ModalFooter, HStack, VStack, Image } from '@chakra-ui/react'; +import type { UseFormRegister } from 'react-hook-form'; +import { useFieldArray, useForm } from 'react-hook-form'; import { postInsertData2Dataset, putDatasetDataById, getDatasetCollectionById, getDatasetDataItemById } from '@/web/core/dataset/api'; -import { useToast } from '@fastgpt/web/hooks/useToast'; import MyIcon from '@fastgpt/web/components/common/Icon'; import MyModal from '@fastgpt/web/components/common/MyModal'; import MyTooltip from '@fastgpt/web/components/common/MyTooltip'; import { useTranslation } from 'next-i18next'; import { useRequest2 } from '@fastgpt/web/hooks/useRequest'; -import { getSourceNameIcon } from '@fastgpt/global/core/dataset/utils'; -import { type DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type'; +import { getCollectionIcon } from '@fastgpt/global/core/dataset/utils'; +import type { DatasetDataIndexItemType } from '@fastgpt/global/core/dataset/type'; import DeleteIcon from '@fastgpt/web/components/common/Icon/delete'; import { defaultCollectionDetail } from '@/web/core/dataset/constants'; import MyBox from '@fastgpt/web/components/common/MyBox'; -import { getErrText } from '@fastgpt/global/common/error/utils'; import { useSystemStore } from '@/web/common/system/useSystemStore'; import styles from './styles.module.scss'; import { DatasetDataIndexTypeEnum, getDatasetIndexMapData } from '@fastgpt/global/core/dataset/data/constants'; +import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constants'; import FillRowTabs from '@fastgpt/web/components/common/Tabs/FillRowTabs'; import FormLabel from '@fastgpt/web/components/common/MyBox/FormLabel'; import MyIconButton from '@fastgpt/web/components/common/Icon/button'; +import MyImage from '@/components/MyImage/index'; export type InputDataType = { q: string; a: string; + imagePreivewUrl?: string; indexes: (Omit & { dataId?: string; // pg data id fold: boolean; @@ -40,7 +42,8 @@ export type InputDataType = { enum TabEnum { chunk = 'chunk', - qa = 'qa' + qa = 'qa', + image = 'image' } const InputDataModal = ({ @@ -52,17 +55,16 @@ const InputDataModal = ({ }: { collectionId: string; dataId?: string; - defaultValue?: { q: string; a?: string }; + defaultValue?: { q?: string; a?: string; imagePreivewUrl?: string }; onClose: () => void; onSuccess: (data: InputDataType & { dataId: string }) => void; }) => { const { t } = useTranslation(); - const { toast } = useToast(); const { embeddingModelList, defaultModels } = useSystemStore(); - const [currentTab, setCurrentTab] = useState(TabEnum.chunk); + const [currentTab, setCurrentTab] = useState(); - const { register, handleSubmit, reset, control } = useForm(); + const { register, handleSubmit, reset, control, watch } = useForm(); const { fields: indexes, prepend: prependIndexes, @@ -72,16 +74,24 @@ const InputDataModal = ({ control, name: 'indexes' 
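
// Illustrative sketch (not part of the patch): with the new TabEnum.image value,
// InputDataModal no longer hard-codes the starting tab; the collection fetch further
// down switches to the image tab when the collection type is `images`. A minimal
// sketch of that decision; the helper name is an assumption.
type InputTabSketch = 'chunk' | 'qa' | 'image';

function getInitialTab(collectionType: string): InputTabSketch {
  // Image collections open directly on the image tab; everything else starts on chunk.
  return collectionType === 'images' ? 'image' : 'chunk';
}

// e.g. getInitialTab('images') === 'image'; getInitialTab('file') === 'chunk'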
}); + const imagePreivewUrl = watch('imagePreivewUrl'); const { data: collection = defaultCollectionDetail } = useRequest2( - () => { - return getDatasetCollectionById(collectionId); - }, + () => getDatasetCollectionById(collectionId), { manual: false, - refreshDeps: [collectionId] + refreshDeps: [collectionId], + onSuccess(res) { + if (res.type === DatasetCollectionTypeEnum.images) { + setCurrentTab(TabEnum.image); + } else { + setCurrentTab(TabEnum.chunk); + } + } } ); + + // Get data const { loading: isFetchingData } = useRequest2( async () => { if (dataId) return getDatasetDataItemById(dataId); @@ -93,8 +103,9 @@ const InputDataModal = ({ onSuccess(res) { if (res) { reset({ - q: res.q, - a: res.a, + q: res.q || '', + a: res.a || '', + imagePreivewUrl: res.imagePreivewUrl, indexes: res.indexes.map((item) => ({ ...item, fold: true @@ -102,54 +113,32 @@ const InputDataModal = ({ }); } else if (defaultValue) { reset({ - q: defaultValue.q, - a: defaultValue.a + q: defaultValue.q || '', + a: defaultValue.a || '', + imagePreivewUrl: defaultValue.imagePreivewUrl }); } - - if (res?.a || defaultValue?.a) { - setCurrentTab(TabEnum.qa); - } }, onError(err) { - toast({ - status: 'error', - title: t(getErrText(err) as any) - }); onClose(); } } ); - const maxToken = useMemo(() => { - const vectorModel = - embeddingModelList.find((item) => item.model === collection.dataset.vectorModel) || - defaultModels.embedding; - - return vectorModel?.maxToken || 3000; - }, [collection.dataset.vectorModel, defaultModels.embedding, embeddingModelList]); - - // import new data + // Import new data const { runAsync: sureImportData, loading: isImporting } = useRequest2( async (e: InputDataType) => { - if (!e.q) { - return Promise.reject(t('common:dataset.data.input is empty')); - } - - const totalLength = e.q.length + (e.a?.length || 0); - if (totalLength >= maxToken * 1.4) { - return Promise.reject(t('common:core.dataset.data.Too Long')); - } - const data = { ...e }; - const dataId = await postInsertData2Dataset({ + const postData: any = { collectionId: collection._id, q: e.q, a: currentTab === TabEnum.qa ? e.a : '', // Contains no default index - indexes: e.indexes?.filter((item) => !!item.text?.trim()) - }); + indexes: e.indexes.filter((item) => !!item.text?.trim()) + }; + + const dataId = await postInsertData2Dataset(postData); return { ...data, @@ -166,23 +155,26 @@ const InputDataModal = ({ a: '', indexes: [] }); + onSuccess(e); }, - errorToast: t('common:error.unKnow') + errorToast: t('dataset:common.error.unKnow') } ); - // update + // Update data const { runAsync: onUpdateData, loading: isUpdating } = useRequest2( async (e: InputDataType) => { if (!dataId) return Promise.reject(t('common:error.unKnow')); - await putDatasetDataById({ + const updateData: any = { dataId, q: e.q, a: currentTab === TabEnum.qa ? 
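
// Illustrative sketch (not part of the patch): both the insert and update handlers above
// shape their payload the same way - the answer field is only kept on the QA tab, and
// indexes with blank text are dropped before calling the API. A standalone sketch of
// that shaping; the types are simplified stand-ins for the real ones.
type IndexItemSketch = { text?: string; dataId?: string };

function buildDataPayload(params: {
  q: string;
  a?: string;
  indexes: IndexItemSketch[];
  isQaMode: boolean;
}) {
  return {
    q: params.q,
    // The answer field is only meaningful in QA mode.
    a: params.isQaMode ? params.a : '',
    // Drop indexes whose text is empty or whitespace-only.
    indexes: params.indexes.filter((item) => !!item.text?.trim())
  };
}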
e.a : '', indexes: e.indexes.filter((item) => !!item.text?.trim()) - }); + }; + + await putDatasetDataById(updateData); return { dataId, @@ -202,10 +194,18 @@ const InputDataModal = ({ const isLoading = isFetchingData; const icon = useMemo( - () => getSourceNameIcon({ sourceName: collection.sourceName, sourceId: collection.sourceId }), + () => getCollectionIcon({ type: collection.type, name: collection.sourceName }), [collection] ); + const maxToken = useMemo(() => { + const vectorModel = + embeddingModelList.find((item) => item.model === collection.dataset.vectorModel) || + defaultModels.embedding; + + return vectorModel?.maxToken || 2000; + }, [collection.dataset.vectorModel, defaultModels.embedding, embeddingModelList]); + return ( {/* Tab */} - { - setCurrentTab(e); - }} - /> + {(currentTab === TabEnum.chunk || currentTab === TabEnum.qa) && ( + { + setCurrentTab(e); + }} + /> + )} @@ -268,45 +270,64 @@ const InputDataModal = ({ w={['100%', 0]} overflow={['unset', 'auto']} > - - - {currentTab === TabEnum.chunk - ? t('common:dataset_data_input_chunk_content') - : t('common:dataset_data_input_q')} - -
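
// Illustrative sketch (not part of the patch): the maxToken memo above resolves the
// collection's vector model from the loaded embedding model list, falls back to the
// default embedding model, and finally to 2000 tokens. The same lookup as a plain
// function; the model shape is reduced to the fields used here.
type EmbeddingModelSketch = { model: string; maxToken?: number };

function resolveMaxToken(
  embeddingModelList: EmbeddingModelSketch[],
  collectionVectorModel: string,
  defaultEmbeddingModel?: EmbeddingModelSketch
): number {
  const vectorModel =
    embeddingModelList.find((item) => item.model === collectionVectorModel) ||
    defaultEmbeddingModel;
  return vectorModel?.maxToken || 2000;
}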