diff --git a/deploy/docker/docker-compose-milvus.yml b/deploy/docker/docker-compose-milvus.yml index 5fb04c883..952970303 100644 --- a/deploy/docker/docker-compose-milvus.yml +++ b/deploy/docker/docker-compose-milvus.yml @@ -114,15 +114,15 @@ services: # fastgpt sandbox: container_name: sandbox - image: ghcr.io/labring/fastgpt-sandbox:v4.9.0 # git - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.0 # 阿里云 + image: ghcr.io/labring/fastgpt-sandbox:v4.9.1 # git + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.1 # 阿里云 networks: - fastgpt restart: always fastgpt: container_name: fastgpt - image: ghcr.io/labring/fastgpt:v4.9.0 # git - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.0 # 阿里云 + image: ghcr.io/labring/fastgpt:v4.9.1 # git + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.1 # 阿里云 ports: - 3000:3000 networks: @@ -175,7 +175,8 @@ services: # AI Proxy aiproxy: - image: 'ghcr.io/labring/aiproxy:latest' + image: ghcr.io/labring/aiproxy:v0.1.3 + # image: registry.cn-hangzhou.aliyuncs.com/labring/aiproxy:v0.1.3 # 阿里云 container_name: aiproxy restart: unless-stopped depends_on: diff --git a/deploy/docker/docker-compose-pgvector.yml b/deploy/docker/docker-compose-pgvector.yml index 5ce83bfc1..ea22bd6b5 100644 --- a/deploy/docker/docker-compose-pgvector.yml +++ b/deploy/docker/docker-compose-pgvector.yml @@ -72,15 +72,15 @@ services: # fastgpt sandbox: container_name: sandbox - image: ghcr.io/labring/fastgpt-sandbox:v4.9.0 # git - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.0 # 阿里云 + image: ghcr.io/labring/fastgpt-sandbox:v4.9.1 # git + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.1 # 阿里云 networks: - fastgpt restart: always fastgpt: container_name: fastgpt - image: ghcr.io/labring/fastgpt:v4.9.0 # git - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.0 # 阿里云 + image: ghcr.io/labring/fastgpt:v4.9.1 # git + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.1 # 阿里云 ports: - 3000:3000 networks: @@ -132,7 +132,8 @@ services: # AI Proxy aiproxy: - image: 'ghcr.io/labring/aiproxy:latest' + image: ghcr.io/labring/aiproxy:v0.1.3 + # image: registry.cn-hangzhou.aliyuncs.com/labring/aiproxy:v0.1.3 # 阿里云 container_name: aiproxy restart: unless-stopped depends_on: diff --git a/deploy/docker/docker-compose-zilliz.yml b/deploy/docker/docker-compose-zilliz.yml index a63fb6c30..292289baf 100644 --- a/deploy/docker/docker-compose-zilliz.yml +++ b/deploy/docker/docker-compose-zilliz.yml @@ -53,15 +53,15 @@ services: wait $$! sandbox: container_name: sandbox - image: ghcr.io/labring/fastgpt-sandbox:v4.9.0 # git - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.0 # 阿里云 + image: ghcr.io/labring/fastgpt-sandbox:v4.9.1 # git + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt-sandbox:v4.9.1 # 阿里云 networks: - fastgpt restart: always fastgpt: container_name: fastgpt - image: ghcr.io/labring/fastgpt:v4.9.0 # git - # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.0 # 阿里云 + image: ghcr.io/labring/fastgpt:v4.9.1 # git + # image: registry.cn-hangzhou.aliyuncs.com/fastgpt/fastgpt:v4.9.1 # 阿里云 ports: - 3000:3000 networks: @@ -113,7 +113,8 @@ services: # AI Proxy aiproxy: - image: 'ghcr.io/labring/aiproxy:latest' + image: ghcr.io/labring/aiproxy:v0.1.3 + # image: registry.cn-hangzhou.aliyuncs.com/labring/aiproxy:v0.1.3 # 阿里云 container_name: aiproxy restart: unless-stopped depends_on: diff --git a/docSite/content/zh-cn/docs/development/upgrading/491.md b/docSite/content/zh-cn/docs/development/upgrading/491.md index 4d4c40b9f..8f489a4b7 100644 --- a/docSite/content/zh-cn/docs/development/upgrading/491.md +++ b/docSite/content/zh-cn/docs/development/upgrading/491.md @@ -11,7 +11,12 @@ weight: 799 ### 1. 做好数据库备份 -### 2. 更新镜像和 PG 容器 +### 2. 更新镜像 + +- 更新 FastGPT 镜像 tag: v4.9.1 +- 更新 FastGPT 商业版镜像 tag: v4.9.1 +- Sandbox 镜像,可以不更新 +- AIProxy 镜像修改为: registry.cn-hangzhou.aliyuncs.com/labring/aiproxy:v0.1.3 ### 3. 执行升级脚本 @@ -25,7 +30,7 @@ curl --location --request POST 'https://{{host}}/api/admin/initv491' \ **脚本功能** -重新使用最新的 jieba 分词库进行分词处理。 +重新使用最新的 jieba 分词库进行分词处理。时间较长,可以从日志里查看进度。 ## 🚀 新增内容 diff --git a/packages/service/core/dataset/data/schema.ts b/packages/service/core/dataset/data/schema.ts index 3457d2f12..92f2acb31 100644 --- a/packages/service/core/dataset/data/schema.ts +++ b/packages/service/core/dataset/data/schema.ts @@ -106,7 +106,7 @@ try { DatasetDataSchema.index({ rebuilding: 1, teamId: 1, datasetId: 1 }); // 为查询 initJieba 字段不存在的数据添加索引 - DatasetDataSchema.index({ initJieba: 1 }, { sparse: true }); + DatasetDataSchema.index({ initJieba: 1, updateTime: 1 }); } catch (error) { console.log(error); } diff --git a/projects/app/src/pages/api/admin/initv491.ts b/projects/app/src/pages/api/admin/initv491.ts index ad391838f..07c431655 100644 --- a/projects/app/src/pages/api/admin/initv491.ts +++ b/projects/app/src/pages/api/admin/initv491.ts @@ -7,44 +7,64 @@ import { addLog } from '@fastgpt/service/common/system/log'; import { delay } from '@fastgpt/global/common/system/utils'; import { MongoDatasetDataText } from '@fastgpt/service/core/dataset/data/dataTextSchema'; import { mongoSessionRun } from '@fastgpt/service/common/mongo/sessionRun'; +import { DatasetDataTextSchemaType } from '@fastgpt/global/core/dataset/type'; +import type { AnyBulkWriteOperation } from '@fastgpt/service/common/mongo'; const updateData = async () => { let success = 0; + while (true) { try { - const data = await MongoDatasetData.find({ initJieba: { $exists: false } }).limit(100); + const time = Date.now(); + const data = await MongoDatasetData.find({ + initJieba: { $exists: false }, + updateTime: { $lte: time } // 只需要取旧的数据 + }) + .limit(1000) + .lean(); if (data.length === 0) { console.log('更新分词完成'); break; } + console.log('读取数据完成', Date.now() - time); + const dataTextOps: AnyBulkWriteOperation[] = []; + const datasetDataIds: string[] = []; - await Promise.allSettled( - data.map(async (item) => { - const text = `${item.q} ${item.a}`.trim(); + // 先进行分词处理 + for await (const item of data) { + const text = `${item.q} ${item.a}`.trim(); + try { + const tokens = await jiebaSplit({ text }); + dataTextOps.push({ + updateOne: { + filter: { dataId: item._id }, + update: { $set: { fullTextToken: tokens } } + } + }); + datasetDataIds.push(item._id); + } catch (error) { + console.log(`分词处理错误: ${item._id}`, error); + } + } + console.log('分词处理完成', Date.now() - time); - try { - await mongoSessionRun(async (session) => { - await MongoDatasetDataText.updateOne( - { - dataId: item._id - }, - { - fullTextToken: await jiebaSplit({ text }) - }, - { - session - } - ); - // @ts-ignore - item.initJieba = true; - await item.save({ session }); - }); - } catch (error) { - console.log(error); - } - }) - ); - success += data.length; + await mongoSessionRun(async (session) => { + if (dataTextOps.length > 0) { + await MongoDatasetDataText.bulkWrite(dataTextOps, { session, ordered: true }); + } + if (datasetDataIds.length > 0) { + await MongoDatasetData.updateMany( + { _id: { $in: datasetDataIds } }, + { $set: { initJieba: true } }, + { + session + } + ); + } + }); + console.log('保存完成', Date.now() - time); + + success += dataTextOps.length; console.log(`成功 ${success}`); } catch (error) { addLog.error('更新所有旧的 jieba 分词失败', error);