update batch files in update index

2026-01-18 09:12:52 +00:00 · 2025-07-10 12:54:57 +08:00 · 2025-07-10 12:54:57 +08:00 · c1fbd4da21
commit c1fbd4da21
parent 21f4734917
1 changed files with 419 additions and 366 deletions
--- a/src/database/modules/vector/vector-manager.ts
+++ b/src/database/modules/vector/vector-manager.ts
@ -30,7 +30,7 @@ export class VectorManager {
 	/**
 	 * Stores the given `app` and `dbManager` on the instance and builds the
 	 * `VectorRepository` from `app` and `dbManager.getPgClient()`.
 	 */
 	constructor(app: App, dbManager: DBManager) {
 		this.app = app
 		this.dbManager = dbManager
 		// NOTE(review): the added line below casts the client to `any`, which turns off
 		// type checking for this argument — presumably a workaround for a type mismatch
 		// between getPgClient() and VectorRepository; confirm and narrow the cast if possible.
-		this.repository = new VectorRepository(app, dbManager.getPgClient())
+		this.repository = new VectorRepository(app, dbManager.getPgClient() as any)
 	}
 	async performSimilaritySearch(
@ -103,13 +103,25 @@ export class VectorManager {
 	/**
 	 * Helper that forces garbage collection (best effort).
 	 *
 	 * As shown in this hunk, the new version loops three times and calls `gc()` on
 	 * `global` when it is exposed there, otherwise on `window` when exposed there.
 	 * If `global.gc` exists, one more call is scheduled through `setTimeout(..., 0)`.
 	 * Any error thrown along the way is caught and only logged with `console.debug`.
 	 */
 	private forceGarbageCollection() {
 		try {
-			if (typeof global !== 'undefined' && global.gc) {
+			// Trigger garbage collection several times to make sure resources are released
-				global.gc()
+			for (let i = 0; i < 3; i++) {
 				if (typeof global !== 'undefined' && (global as any).gc) {
 					(global as any).gc()
 				} else if (typeof window !== 'undefined' && (window as any).gc) {
-				((window as any).gc as () => void)();
+					(window as any).gc()
 				}
 			}
 			// Force cleanup of references that may still be lingering
 			if (typeof global !== 'undefined' && (global as any).gc) {
 				// Node.js environment
 				setTimeout(() => {
 					(global as any).gc?.()
 				}, 0)
 			}
 		} catch (e) {
 			// Ignore garbage-collection errors
 			console.debug('GC error (ignored):', e)
 		}
 	}
@ -188,9 +200,45 @@ export class VectorManager {
 		console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)
 		const skippedFiles: string[] = []
-		const contentChunks: InsertVector[] = (
+		const embeddingProgress = { completed: 0, totalChunks: 0 }
 		// 分批处理文件，每批最多50个文件（减少以避免文件句柄耗尽）
 		const FILE_BATCH_SIZE = 50
 		// 减少批量大小以降低内存压力
 		const embeddingBatchSize = Math.min(options.batchSize, 10)
 		// 首先统计总的分块数量用于进度显示
 		let totalChunks = 0
 		for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
 			const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
 			for (const file of fileBatch) {
 				try {
 					let fileContent = await this.app.vault.cachedRead(file)
 					fileContent = fileContent.replace(/\0/g, '')
 					const fileDocuments = await textSplitter.createDocuments([fileContent])
 					totalChunks += fileDocuments.length
 				} catch (error) {
 					// 统计阶段跳过错误文件
 				}
 			}
 		}
 		embeddingProgress.totalChunks = totalChunks
 		updateProgress?.({
 			completedChunks: 0,
 			totalChunks: totalChunks,
 			totalFiles: filesToIndex.length,
 		})
 		try {
 			for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
 				const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
 				console.log(`Processing file batch ${Math.floor(i / FILE_BATCH_SIZE) + 1}/${Math.ceil(filesToIndex.length / FILE_BATCH_SIZE)} (${fileBatch.length} files)`)
 				// 第一步：分块处理
 				const batchChunks = (
 					await Promise.all(
-				filesToIndex.map(async (file) => {
+						fileBatch.map(async (file) => {
 							try {
 								let fileContent = await this.app.vault.cachedRead(file)
 								// 清理null字节，防止PostgreSQL UTF8编码错误
@ -226,37 +274,23 @@ export class VectorManager {
 					)
 				).flat()
-		console.log("contentChunks: ", contentChunks.length)
+				if (batchChunks.length === 0) {
-
+					continue
 		if (skippedFiles.length > 0) {
 			console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
 			new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
 				}
-		updateProgress?.({
+				// 第二步：嵌入处理
-			completedChunks: 0,
+				console.log(`Embedding ${batchChunks.length} chunks for current file batch`)
 			totalChunks: contentChunks.length,
 			totalFiles: filesToIndex.length,
 		})
 		const embeddingProgress = { completed: 0 }
 		// 减少批量大小以降低内存压力
 		const batchSize = Math.min(options.batchSize, 20) // 限制最大批量大小
 		let batchCount = 0
 		try {
 				if (embeddingModel.supportsBatch) {
-				// 支持批量处理的提供商：使用流式处理逻辑
+					// 支持批量处理的提供商
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
-					batchCount++
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
 					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
 						const embeddedBatch: InsertVector[] = []
 						await backOff(
 							async () => {
 								// 在嵌入之前处理 markdown
-							const cleanedBatchData = batchChunks.map(chunk => {
+								const cleanedBatchData = embeddingBatch.map(chunk => {
 									const cleanContent = removeMarkdown(chunk.content)
 									return { chunk, cleanContent }
 								}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
@ -269,64 +303,49 @@ export class VectorManager {
 								const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
 								// 合并embedding结果到chunk数据
-							for (let j = 0; j < cleanedBatchData.length; j++) {
+								for (let k = 0; k < cleanedBatchData.length; k++) {
-								const { chunk, cleanContent } = cleanedBatchData[j]
+									const { chunk, cleanContent } = cleanedBatchData[k]
 									const embeddedChunk: InsertVector = {
 										path: chunk.path,
 										mtime: chunk.mtime,
 										content: cleanContent, // 使用已经清理过的内容
-									embedding: batchEmbeddings[j],
+										embedding: batchEmbeddings[k],
 										metadata: chunk.metadata,
 									}
 									embeddedBatch.push(embeddedChunk)
 								}
 							},
 							{
-							numOfAttempts: 3, // 减少重试次数
+								numOfAttempts: 3,
-							startingDelay: 500, // 减少延迟
+								startingDelay: 500,
 								timeMultiple: 1.5,
 								jitter: 'full',
 							},
 						)
-					// 使用事务批量插入，减少数据库连接压力
+						// 第三步：立即存储
 						if (embeddedBatch.length > 0) {
 							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						console.log("insert vectors with transaction success, batch size: ", embeddedBatch.length)
+							console.log(`Stored ${embeddedBatch.length} embedded chunks`)
 						// 清理批次数据
 						embeddedBatch.length = 0
 						}
-					embeddingProgress.completed += batchChunks.length
+						embeddingProgress.completed += embeddingBatch.length
 						updateProgress?.({
 							completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
+							totalChunks: embeddingProgress.totalChunks,
 							totalFiles: filesToIndex.length,
 						})
 					// 定期内存清理和延迟
 					await this.memoryCleanupWithDelay(batchCount)
 					}
 				} else {
-				// 不支持批量处理的提供商：大幅降低并发度
+					// 不支持批量处理的提供商（减少并发度以避免文件句柄耗尽）
-				const limit = pLimit(8) // 从32降低到8，大幅减少并发压力
+					const limit = pLimit(3)
 				const abortController = new AbortController()
-				// 流式处理：分批处理并立即插入
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
 					if (abortController.signal.aborted) {
 						throw new Error('Operation was aborted')
 					}
 					batchCount++
 					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
 						const embeddedBatch: InsertVector[] = []
-					const tasks = batchChunks.map((chunk) =>
+						const tasks = embeddingBatch.map((chunk) =>
 							limit(async () => {
 							if (abortController.signal.aborted) {
 								throw new Error('Operation was aborted')
 							}
 								try {
 									await backOff(
 										async () => {
@ -349,38 +368,40 @@ export class VectorManager {
 										},
 										{
 											numOfAttempts: 3,
-										startingDelay: 1000, // 增加延迟
+											startingDelay: 1000,
 											timeMultiple: 2.0,
 											jitter: 'full',
 										},
 									)
 								} catch (error) {
 									console.error('Error in embedding task:', error)
 								// 不要立即中止，继续处理其他任务
 								}
 							}),
 						)
 						await Promise.all(tasks)
-					// 使用事务批量插入
+						// 第三步：立即存储
 						if (embeddedBatch.length > 0) {
 							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						// 清理批次数据
+							console.log(`Stored ${embeddedBatch.length} embedded chunks`)
 						embeddedBatch.length = 0
 						}
-					embeddingProgress.completed += batchChunks.length
+						embeddingProgress.completed += embeddingBatch.length
 						updateProgress?.({
 							completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
+							totalChunks: embeddingProgress.totalChunks,
 							totalFiles: filesToIndex.length,
 						})
 					// 定期内存清理和延迟
 					await this.memoryCleanupWithDelay(batchCount)
 					}
 				}
 				// 每批文件处理完后进行强制资源清理
 				await this.forceResourceCleanup()
 				// 额外延迟以允许系统释放文件句柄
 				await new Promise(resolve => setTimeout(resolve, 500))
 			}
 		} catch (error) {
 			if (
 				error instanceof LLMAPIKeyNotSetException ||
@ -395,8 +416,13 @@ export class VectorManager {
 				throw error
 			}
 		} finally {
-			// 最终清理
+			// 最终强制清理
-			this.forceGarbageCollection()
+			await this.forceResourceCleanup()
 		}
 		if (skippedFiles.length > 0) {
 			console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
 			new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
 		}
 	}
@ -472,9 +498,45 @@ export class VectorManager {
 		console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)
 		const skippedFiles: string[] = []
-		const contentChunks: InsertVector[] = (
+		const embeddingProgress = { completed: 0, totalChunks: 0 }
 		// 分批处理文件，每批最多50个文件（减少以避免文件句柄耗尽）
 		const FILE_BATCH_SIZE = 50
 		// 减少批量大小以降低内存压力
 		const embeddingBatchSize = Math.min(options.batchSize, 10)
 		// 首先统计总的分块数量用于进度显示
 		let totalChunks = 0
 		for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
 			const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
 			for (const file of fileBatch) {
 				try {
 					let fileContent = await this.app.vault.cachedRead(file)
 					fileContent = fileContent.replace(/\0/g, '')
 					const fileDocuments = await textSplitter.createDocuments([fileContent])
 					totalChunks += fileDocuments.length
 				} catch (error) {
 					// 统计阶段跳过错误文件
 				}
 			}
 		}
 		embeddingProgress.totalChunks = totalChunks
 		updateProgress?.({
 			completedChunks: 0,
 			totalChunks: totalChunks,
 			totalFiles: filesToIndex.length,
 		})
 		try {
 			for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
 				const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
 				console.log(`Processing workspace file batch ${Math.floor(i / FILE_BATCH_SIZE) + 1}/${Math.ceil(filesToIndex.length / FILE_BATCH_SIZE)} (${fileBatch.length} files)`)
 				// 第一步：分块处理
 				const batchChunks = (
 					await Promise.all(
-				filesToIndex.map(async (file) => {
+						fileBatch.map(async (file) => {
 							try {
 								let fileContent = await this.app.vault.cachedRead(file)
 								// 清理null字节，防止PostgreSQL UTF8编码错误
@ -510,37 +572,23 @@ export class VectorManager {
 					)
 				).flat()
-		console.log("contentChunks: ", contentChunks.length)
+				if (batchChunks.length === 0) {
-
+					continue
 		if (skippedFiles.length > 0) {
 			console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
 			new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
 				}
-		updateProgress?.({
+				// 第二步：嵌入处理
-			completedChunks: 0,
+				console.log(`Embedding ${batchChunks.length} chunks for current workspace file batch`)
 			totalChunks: contentChunks.length,
 			totalFiles: filesToIndex.length,
 		})
 		const embeddingProgress = { completed: 0 }
 		// 减少批量大小以降低内存压力
 		const batchSize = Math.min(options.batchSize, 20) // 限制最大批量大小
 		let batchCount = 0
 		try {
 				if (embeddingModel.supportsBatch) {
-				// 支持批量处理的提供商：使用流式处理逻辑
+					// 支持批量处理的提供商
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
-					batchCount++
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
 					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
 						const embeddedBatch: InsertVector[] = []
 						await backOff(
 							async () => {
-							// 在嵌入之前处理 markdown，只处理一次
+								// 在嵌入之前处理 markdown
-							const cleanedBatchData = batchChunks.map(chunk => {
+								const cleanedBatchData = embeddingBatch.map(chunk => {
 									const cleanContent = removeMarkdown(chunk.content)
 									return { chunk, cleanContent }
 								}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
@ -553,13 +601,13 @@ export class VectorManager {
 								const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
 								// 合并embedding结果到chunk数据
-							for (let j = 0; j < cleanedBatchData.length; j++) {
+								for (let k = 0; k < cleanedBatchData.length; k++) {
-								const { chunk, cleanContent } = cleanedBatchData[j]
+									const { chunk, cleanContent } = cleanedBatchData[k]
 									const embeddedChunk: InsertVector = {
 										path: chunk.path,
 										mtime: chunk.mtime,
 										content: cleanContent, // 使用已经清理过的内容
-									embedding: batchEmbeddings[j],
+										embedding: batchEmbeddings[k],
 										metadata: chunk.metadata,
 									}
 									embeddedBatch.push(embeddedChunk)
@ -567,49 +615,35 @@ export class VectorManager {
 							},
 							{
 								numOfAttempts: 3,
-							startingDelay: 1000, // 增加延迟
+								startingDelay: 1000,
 								timeMultiple: 2.0,
 								jitter: 'full',
 							},
 						)
-					// 使用事务批量插入，减少数据库连接压力
+						// 第三步：立即存储
 						if (embeddedBatch.length > 0) {
 							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						// 清理批次数据
+							console.log(`Stored ${embeddedBatch.length} embedded chunks for workspace`)
 						embeddedBatch.length = 0
 						}
-					embeddingProgress.completed += batchChunks.length
+						embeddingProgress.completed += embeddingBatch.length
 						updateProgress?.({
 							completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
+							totalChunks: embeddingProgress.totalChunks,
 							totalFiles: filesToIndex.length,
 						})
 					// 定期内存清理和延迟
 					await this.memoryCleanupWithDelay(batchCount)
 					}
 				} else {
-				// 不支持批量处理的提供商：大幅降低并发度
+					// 不支持批量处理的提供商（减少并发度以避免文件句柄耗尽）
-				const limit = pLimit(8) // 从32降低到8，大幅减少并发压力
+					const limit = pLimit(3)
 				const abortController = new AbortController()
-				// 流式处理：分批处理并立即插入
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
 					if (abortController.signal.aborted) {
 						throw new Error('Operation was aborted')
 					}
 					batchCount++
 					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
 						const embeddedBatch: InsertVector[] = []
-					const tasks = batchChunks.map((chunk) =>
+						const tasks = embeddingBatch.map((chunk) =>
 							limit(async () => {
 							if (abortController.signal.aborted) {
 								throw new Error('Operation was aborted')
 							}
 								try {
 									await backOff(
 										async () => {
@ -632,38 +666,40 @@ export class VectorManager {
 										},
 										{
 											numOfAttempts: 3,
-										startingDelay: 1000, // 增加延迟
+											startingDelay: 1000,
 											timeMultiple: 2.0,
 											jitter: 'full',
 										},
 									)
 								} catch (error) {
 									console.error('Error in embedding task:', error)
 								// 不要立即中止，继续处理其他任务
 								}
 							}),
 						)
 						await Promise.all(tasks)
-					// 使用事务批量插入
+						// 第三步：立即存储
 						if (embeddedBatch.length > 0) {
 							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						// 清理批次数据
+							console.log(`Stored ${embeddedBatch.length} embedded chunks for workspace`)
 						embeddedBatch.length = 0
 						}
-					embeddingProgress.completed += batchChunks.length
+						embeddingProgress.completed += embeddingBatch.length
 						updateProgress?.({
 							completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
+							totalChunks: embeddingProgress.totalChunks,
 							totalFiles: filesToIndex.length,
 						})
 					// 定期内存清理和延迟
 					await this.memoryCleanupWithDelay(batchCount)
 					}
 				}
 				// 每批文件处理完后进行强制资源清理
 				await this.forceResourceCleanup()
 				// 额外延迟以允许系统释放文件句柄
 				await new Promise(resolve => setTimeout(resolve, 500))
 			}
 		} catch (error) {
 			if (
 				error instanceof LLMAPIKeyNotSetException ||
@ -678,8 +714,13 @@ export class VectorManager {
 				throw error
 			}
 		} finally {
-			// 最终清理
+			// 最终强制清理
-			this.forceGarbageCollection()
+			await this.forceResourceCleanup()
 		}
 		if (skippedFiles.length > 0) {
 			console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
 			new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
 		}
 	}
@ -1034,14 +1075,26 @@ export class VectorManager {
 	/**
 	 * Enhanced memory cleanup that also pauses.
 	 *
 	 * When `batchCount` is a multiple of the interval (5 in the removed line, 3 in the
 	 * added line), calls `forceGarbageCollection()` and then waits on a timer
 	 * (500 ms in the removed line, 1000 ms in the added line). Otherwise does nothing.
 	 *
 	 * @param batchCount Counter supplied by the caller; only its remainder modulo the
 	 *                   interval is inspected here.
 	 */
 	private async memoryCleanupWithDelay(batchCount: number) {
-		// Every 5 batches, force garbage collection and pause
+		// Every 3 batches, force garbage collection and pause
-		if (batchCount % 5 === 0) {
+		if (batchCount % 3 === 0) {
 			this.forceGarbageCollection()
-			// Add a delay to give the system time to process
+			// Add a delay to give the system time to process and release file handles
-			await new Promise(resolve => setTimeout(resolve, 500))
+			await new Promise(resolve => setTimeout(resolve, 1000))
 		}
 	}
 	/**
 	 * Forced memory and resource cleanup.
 	 *
 	 * Calls `forceGarbageCollection()` five times, waiting 100 ms after each call,
 	 * then waits a further 500 ms before resolving. The fixed waits add up to
 	 * roughly one second per invocation.
 	 */
 	private async forceResourceCleanup() {
 		// Run garbage collection several times
 		for (let i = 0; i < 5; i++) {
 			this.forceGarbageCollection()
 			await new Promise(resolve => setTimeout(resolve, 100))
 		}
 		// Extra delay so the system can release resources
 		await new Promise(resolve => setTimeout(resolve, 500))
 	}
 	// 使用事务插入向量的方法
 	private async insertVectorsWithTransaction(
 		data: InsertVector[],