diff --git a/src/components/inline-edit/InlineEdit.tsx b/src/components/inline-edit/InlineEdit.tsx
index 03d78ea..8dcbf35 100644
--- a/src/components/inline-edit/InlineEdit.tsx
+++ b/src/components/inline-edit/InlineEdit.tsx
@@ -258,6 +258,8 @@ export const InlineEdit: React.FC = ({
     let fileContent: string;
     try {
       fileContent = await plugin.app.vault.cachedRead(activeFile);
+      // Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+      fileContent = fileContent.replace(/\0/g, '');
     } catch (err) {
       const error = err as Error;
       console.error(t("inlineEdit.readFileError"), error.message);
@@ -278,7 +280,9 @@ export const InlineEdit: React.FC = ({
       return;
     }

-    const oldContent = await plugin.app.vault.read(activeFile);
+    let oldContent = await plugin.app.vault.read(activeFile);
+    // Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+    oldContent = oldContent.replace(/\0/g, '');
     await plugin.app.workspace.getLeaf(true).setViewState({
       type: APPLY_VIEW_TYPE,
       active: true,
diff --git a/src/core/file-search/match/coreplugin-match.ts b/src/core/file-search/match/coreplugin-match.ts
index e54172d..a82f73b 100644
--- a/src/core/file-search/match/coreplugin-match.ts
+++ b/src/core/file-search/match/coreplugin-match.ts
@@ -56,7 +56,9 @@ export async function matchSearchUsingCorePlugin(
       break;
     }

-    const content = await vault.cachedRead(file as TFile);
+    let content = await vault.cachedRead(file as TFile);
+    // Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+    content = content.replace(/\0/g, '');
     const lines = content.split('\n');

     // `fileMatches.result.content` holds an array of matches for the file.
diff --git a/src/database/modules/vector/vector-manager.ts b/src/database/modules/vector/vector-manager.ts
index a579f67..eb4511e 100644
--- a/src/database/modules/vector/vector-manager.ts
+++ b/src/database/modules/vector/vector-manager.ts
@@ -52,6 +52,29 @@ export class VectorManager {
     )
   }

+  // Helper that forces garbage collection
+  private forceGarbageCollection() {
+    try {
+      if (typeof global !== 'undefined' && global.gc) {
+        global.gc()
+      } else if (typeof window !== 'undefined' && (window as any).gc) {
+        (window as any).gc()
+      }
+    } catch (e) {
+      // Ignore garbage collection errors
+    }
+  }
+
+  // Helper that checks and cleans up memory
+  private async memoryCleanup(batchCount: number) {
+    // Force garbage collection every 10 batches
+    if (batchCount % 10 === 0) {
+      this.forceGarbageCollection()
+      // Brief delay to let memory cleanup finish
+      await new Promise(resolve => setTimeout(resolve, 100))
+    }
+  }
+
   async updateVaultIndex(
     embeddingModel: EmbeddingModel,
     options: {
@@ -100,29 +123,43 @@
       },
     )

+    const skippedFiles: string[] = []
     const contentChunks: InsertVector[] = (
       await Promise.all(
         filesToIndex.map(async (file) => {
-          const fileContent = await this.app.vault.cachedRead(file)
-          const fileDocuments = await textSplitter.createDocuments([
-            fileContent,
-          ])
-          return fileDocuments.map((chunk): InsertVector => {
-            return {
-              path: file.path,
-              mtime: file.stat.mtime,
-              content: chunk.pageContent,
-              embedding: [],
-              metadata: {
-                startLine: Number(chunk.metadata.loc.lines.from),
-                endLine: Number(chunk.metadata.loc.lines.to),
-              },
-            }
-          })
+          try {
+            let fileContent = await this.app.vault.cachedRead(file)
+            // Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+            fileContent = fileContent.replace(/\0/g, '')
+            const fileDocuments = await textSplitter.createDocuments([
+              fileContent,
+            ])
+            return fileDocuments.map((chunk): InsertVector => {
+              return {
+                path: file.path,
+                mtime: file.stat.mtime,
+                content: chunk.pageContent.replace(/\0/g, ''), // strip again, to be safe
+                embedding: [],
+                metadata: {
+                  startLine: Number(chunk.metadata.loc.lines.from),
+                  endLine: Number(chunk.metadata.loc.lines.to),
+                },
+              }
+            })
+          } catch (error) {
+            console.warn(`Skipping file ${file.path}:`, error.message)
+            skippedFiles.push(file.path)
+            return []
+          }
         }),
       )
     ).flat()

+    if (skippedFiles.length > 0) {
+      console.warn(`Skipped ${skippedFiles.length} problematic files:`, skippedFiles)
+      new Notice(`Skipped ${skippedFiles.length} problematic files`)
+    }
+
     updateProgress?.({
       completedChunks: 0,
       totalChunks: contentChunks.length,
@@ -130,18 +167,22 @@
     })

     const embeddingProgress = { completed: 0 }
-    const embeddingChunks: InsertVector[] = []
-    const insertBatchSize = 64 // database insert batch size
+    // Reduce the batch size to lower memory pressure
+    const insertBatchSize = 16 // lowered from 64 to 16
+    let batchCount = 0

    try {
      if (embeddingModel.supportsBatch) {
-        // Providers with batch support: batch processing logic
-        const embeddingBatchSize = 64 // API batch size
+        // Providers with batch support: streaming logic
+        const embeddingBatchSize = 16 // lowered from 64 to 16

        for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
+          batchCount++
          const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
          const batchTexts = batchChunks.map(chunk => chunk.content)

+          const embeddedBatch: InsertVector[] = []
+
          await backOff(
            async () => {
              const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
@@ -155,80 +196,99 @@
                  embedding: batchEmbeddings[j],
                  metadata: batchChunks[j].metadata,
                }
-                embeddingChunks.push(embeddedChunk)
+                embeddedBatch.push(embeddedChunk)
              }
-
-              embeddingProgress.completed += batchChunks.length
-              updateProgress?.({
-                completedChunks: embeddingProgress.completed,
-                totalChunks: contentChunks.length,
-                totalFiles: filesToIndex.length,
-              })
            },
            {
-              numOfAttempts: 5,
-              startingDelay: 1000,
+              numOfAttempts: 3, // fewer retry attempts
+              startingDelay: 500, // shorter delay
              timeMultiple: 1.5,
              jitter: 'full',
            },
          )
+
+          // Insert the current batch immediately to avoid memory buildup
+          if (embeddedBatch.length > 0) {
+            await this.repository.insertVectors(embeddedBatch, embeddingModel)
+            // Release the batch data
+            embeddedBatch.length = 0
+          }
+
+          embeddingProgress.completed += batchChunks.length
+          updateProgress?.({
+            completedChunks: embeddingProgress.completed,
+            totalChunks: contentChunks.length,
+            totalFiles: filesToIndex.length,
+          })
+
+          // Periodic memory cleanup
+          await this.memoryCleanup(batchCount)
        }
      } else {
-        // Providers without batch support: the original one-at-a-time logic
-        const limit = pLimit(50)
+        // Providers without batch support: streaming logic
+        const limit = pLimit(10) // lowered from 50 to reduce concurrency pressure
        const abortController = new AbortController()
-        const tasks = contentChunks.map((chunk) =>
-          limit(async () => {
-            if (abortController.signal.aborted) {
-              throw new Error('Operation was aborted')
-            }
-            try {
-              await backOff(
-                async () => {
-                  const embedding = await embeddingModel.getEmbedding(chunk.content)
-                  const embeddedChunk = {
-                    path: chunk.path,
-                    mtime: chunk.mtime,
-                    content: chunk.content,
-                    embedding,
-                    metadata: chunk.metadata,
-                  }
-                  embeddingChunks.push(embeddedChunk)
-                  embeddingProgress.completed++
-                  updateProgress?.({
-                    completedChunks: embeddingProgress.completed,
-                    totalChunks: contentChunks.length,
-                    totalFiles: filesToIndex.length,
-                  })
-                },
-                {
-                  numOfAttempts: 5,
-                  startingDelay: 1000,
-                  timeMultiple: 1.5,
-                  jitter: 'full',
-                },
-              )
-            } catch (error) {
-              abortController.abort()
-              throw error
-            }
-          }),
-        )
-        await Promise.all(tasks)
-      }
-
-      // all embedding generated, batch insert
-      if (embeddingChunks.length > 0) {
-        // batch insert all vectors
-        let inserted = 0
-        while (inserted < embeddingChunks.length) {
-          const chunksToInsert = embeddingChunks.slice(
-            inserted,
-            Math.min(inserted + insertBatchSize, embeddingChunks.length)
+
+        // Streaming: process in batches and insert immediately
+        for (let i = 0; i < contentChunks.length; i += insertBatchSize) {
+          if (abortController.signal.aborted) {
+            throw new Error('Operation was aborted')
+          }
+
+          batchCount++
+          const batchChunks = contentChunks.slice(i, Math.min(i + insertBatchSize, contentChunks.length))
+          const embeddedBatch: InsertVector[] = []
+
+          const tasks = batchChunks.map((chunk) =>
+            limit(async () => {
+              if (abortController.signal.aborted) {
+                throw new Error('Operation was aborted')
+              }
+              try {
+                await backOff(
+                  async () => {
+                    const embedding = await embeddingModel.getEmbedding(chunk.content)
+                    const embeddedChunk = {
+                      path: chunk.path,
+                      mtime: chunk.mtime,
+                      content: chunk.content,
+                      embedding,
+                      metadata: chunk.metadata,
+                    }
+                    embeddedBatch.push(embeddedChunk)
+                  },
+                  {
+                    numOfAttempts: 3, // fewer retry attempts
+                    startingDelay: 500, // shorter delay
+                    timeMultiple: 1.5,
+                    jitter: 'full',
+                  },
+                )
+              } catch (error) {
+                abortController.abort()
+                throw error
+              }
+            }),
          )
-          await this.repository.insertVectors(chunksToInsert, embeddingModel)
-          inserted += chunksToInsert.length
+
+          await Promise.all(tasks)
+
+          // Insert the current batch immediately
+          if (embeddedBatch.length > 0) {
+            await this.repository.insertVectors(embeddedBatch, embeddingModel)
+            // Release the batch data
+            embeddedBatch.length = 0
+          }
+
+          embeddingProgress.completed += batchChunks.length
+          updateProgress?.({
+            completedChunks: embeddingProgress.completed,
+            totalChunks: contentChunks.length,
+            totalFiles: filesToIndex.length,
+          })
+
+          // Periodic memory cleanup
+          await this.memoryCleanup(batchCount)
        }
      }
    } catch (error) {
@@ -244,6 +304,9 @@
      console.error('Error embedding chunks:', error)
      throw error
    }
+  } finally {
+    // Final cleanup
+    this.forceGarbageCollection()
  }
 }

@@ -252,125 +315,160 @@
    chunkSize: number,
    file: TFile
  ) {
-
-    // Delete existing vectors for the files
-    await this.repository.deleteVectorsForSingleFile(
-      file.path,
-      embeddingModel,
-    )
-
-    // Embed the files
-    const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
-      'markdown',
-      {
-        chunkSize,
-      },
-    )
-    const fileContent = await this.app.vault.cachedRead(file)
-    const fileDocuments = await textSplitter.createDocuments([
-      fileContent,
-    ])
-
-    const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
-      return {
-        path: file.path,
-        mtime: file.stat.mtime,
-        content: chunk.pageContent,
-        embedding: [],
-        metadata: {
-          startLine: Number(chunk.metadata.loc.lines.from),
-          endLine: Number(chunk.metadata.loc.lines.to),
-        },
-      }
-    })
-
-    const embeddingChunks: InsertVector[] = []
-    const insertBatchSize = 64 // database insert batch size
-    try {
-      if (embeddingModel.supportsBatch) {
-        // Providers with batch support: batch processing logic
-        const embeddingBatchSize = 64 // API batch size
-
-        for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
-          console.log(`Embedding batch ${i / embeddingBatchSize + 1} of ${Math.ceil(contentChunks.length / embeddingBatchSize)}`)
-          const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
-          const batchTexts = batchChunks.map(chunk => chunk.content)
-
-          await backOff(
-            async () => {
-              const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
-
-              // Merge the embedding results into the chunk data
-              for (let j = 0; j < batchChunks.length; j++) {
-                const embeddedChunk: InsertVector = {
-                  path: batchChunks[j].path,
-                  mtime: batchChunks[j].mtime,
-                  content: batchChunks[j].content,
-                  embedding: batchEmbeddings[j],
-                  metadata: batchChunks[j].metadata,
-                }
-                embeddingChunks.push(embeddedChunk)
-              }
-            },
-            {
-              numOfAttempts: 5,
-              startingDelay: 1000,
-              timeMultiple: 1.5,
-              jitter: 'full',
-            },
-          )
+    try {
+      // Delete existing vectors for the files
+      await this.repository.deleteVectorsForSingleFile(
+        file.path,
+        embeddingModel,
+      )
+
+      // Embed the files
+      const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+        'markdown',
+        {
+          chunkSize,
+        },
+      )
+      let fileContent = await this.app.vault.cachedRead(file)
+      // Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+      fileContent = fileContent.replace(/\0/g, '')
+      const fileDocuments = await textSplitter.createDocuments([
+        fileContent,
+      ])
+
+      const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
+        return {
+          path: file.path,
+          mtime: file.stat.mtime,
+          content: chunk.pageContent.replace(/\0/g, ''), // strip again, to be safe
+          embedding: [],
+          metadata: {
+            startLine: Number(chunk.metadata.loc.lines.from),
+            endLine: Number(chunk.metadata.loc.lines.to),
+          },
        }
-    } else {
-      // Providers without batch support: the original one-at-a-time logic
-      const limit = pLimit(50)
-      const abortController = new AbortController()
-      const tasks = contentChunks.map((chunk) =>
-        limit(async () => {
+      })
+
+      // Reduce the batch size to lower memory pressure
+      const insertBatchSize = 16 // lowered from 64 to 16
+      let batchCount = 0
+
+      try {
+        if (embeddingModel.supportsBatch) {
+          // Providers with batch support: streaming logic
+          const embeddingBatchSize = 16 // lowered from 64 to 16
+
+          for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
+            batchCount++
+            console.log(`Embedding batch ${batchCount} of ${Math.ceil(contentChunks.length / embeddingBatchSize)}`)
+            const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
+            const batchTexts = batchChunks.map(chunk => chunk.content)
+
+            const embeddedBatch: InsertVector[] = []
+
+            await backOff(
+              async () => {
+                const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
+
+                // Merge the embedding results into the chunk data
+                for (let j = 0; j < batchChunks.length; j++) {
+                  const embeddedChunk: InsertVector = {
+                    path: batchChunks[j].path,
+                    mtime: batchChunks[j].mtime,
+                    content: batchChunks[j].content,
+                    embedding: batchEmbeddings[j],
+                    metadata: batchChunks[j].metadata,
+                  }
+                  embeddedBatch.push(embeddedChunk)
+                }
+              },
+              {
+                numOfAttempts: 3, // fewer retry attempts
+                startingDelay: 500, // shorter delay
+                timeMultiple: 1.5,
+                jitter: 'full',
+              },
+            )
+
+            // Insert the current batch immediately
+            if (embeddedBatch.length > 0) {
+              await this.repository.insertVectors(embeddedBatch, embeddingModel)
+              // Release the batch data
+              embeddedBatch.length = 0
+            }
+
+            // Periodic memory cleanup
+            await this.memoryCleanup(batchCount)
+          }
+        } else {
+          // Providers without batch support: streaming logic
+          const limit = pLimit(10) // lowered from 50 to 10
+          const abortController = new AbortController()
+
+          // Streaming: process in batches and insert immediately
+          for (let i = 0; i < contentChunks.length; i += insertBatchSize) {
            if (abortController.signal.aborted) {
              throw new Error('Operation was aborted')
            }
-          try {
-            await backOff(
-              async () => {
-                const embedding = await embeddingModel.getEmbedding(chunk.content)
-                const embeddedChunk = {
-                  path: chunk.path,
-                  mtime: chunk.mtime,
-                  content: chunk.content,
-                  embedding,
-                  metadata: chunk.metadata,
-                }
-                embeddingChunks.push(embeddedChunk)
-              },
-              {
-                numOfAttempts: 5,
-                startingDelay: 1000,
-                timeMultiple: 1.5,
-                jitter: 'full',
-              },
-            )
-          } catch (error) {
-            abortController.abort()
-            throw error
+
+            batchCount++
+            const batchChunks = contentChunks.slice(i, Math.min(i + insertBatchSize, contentChunks.length))
+            const embeddedBatch: InsertVector[] = []
+
+            const tasks = batchChunks.map((chunk) =>
+              limit(async () => {
+                if (abortController.signal.aborted) {
+                  throw new Error('Operation was aborted')
+                }
+                try {
+                  await backOff(
+                    async () => {
+                      const embedding = await embeddingModel.getEmbedding(chunk.content)
+                      const embeddedChunk = {
+                        path: chunk.path,
+                        mtime: chunk.mtime,
+                        content: chunk.content,
+                        embedding,
+                        metadata: chunk.metadata,
+                      }
+                      embeddedBatch.push(embeddedChunk)
+                    },
+                    {
+                      numOfAttempts: 3, // fewer retry attempts
+                      startingDelay: 500, // shorter delay
+                      timeMultiple: 1.5,
+                      jitter: 'full',
+                    },
+                  )
+                } catch (error) {
+                  abortController.abort()
+                  throw error
+                }
+              }),
+            )
+
+            await Promise.all(tasks)
+
+            // Insert the current batch immediately
+            if (embeddedBatch.length > 0) {
+              await this.repository.insertVectors(embeddedBatch, embeddingModel)
+              // Release the batch data
+              embeddedBatch.length = 0
            }
-        }),
-      )
-
-      await Promise.all(tasks)
-    }
-    // all embedding generated, batch insert
-    if (embeddingChunks.length > 0) {
-      let inserted = 0
-      while (inserted < embeddingChunks.length) {
-        const chunksToInsert = embeddingChunks.slice(inserted, Math.min(inserted + insertBatchSize, embeddingChunks.length))
-        await this.repository.insertVectors(chunksToInsert, embeddingModel)
-        inserted += chunksToInsert.length
+
+            // Periodic memory cleanup
+            await this.memoryCleanup(batchCount)
          }
        }
      } catch (error) {
        console.error('Error embedding chunks:', error)
+      } finally {
+        // Final cleanup
+        this.forceGarbageCollection()
      }
+    } catch (error) {
+      console.warn(`Skipping file ${file.path}:`, error.message)
+      new Notice(`Skipped file ${file.name}: ${error.message}`)
+    }
  }

@@ -424,25 +522,32 @@
    // Check for updated or new files
    filesToIndex = await Promise.all(
      filesToIndex.map(async (file) => {
-        const fileChunks = await this.repository.getVectorsByFilePath(
-          file.path,
-          embeddingModel,
-        )
-        if (fileChunks.length === 0) {
-          // File is not indexed, so we need to index it
-          const fileContent = await this.app.vault.cachedRead(file)
-          if (fileContent.length === 0) {
-            // Ignore empty files
-            return null
+        try {
+          const fileChunks = await this.repository.getVectorsByFilePath(
+            file.path,
+            embeddingModel,
+          )
+          if (fileChunks.length === 0) {
+            // File is not indexed, so we need to index it
+            let fileContent = await this.app.vault.cachedRead(file)
+            // Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+            fileContent = fileContent.replace(/\0/g, '')
+            if (fileContent.length === 0) {
+              // Ignore empty files
+              return null
+            }
+            return file
          }
-          return file
+          const outOfDate = file.stat.mtime > fileChunks[0].mtime
+          if (outOfDate) {
+            // File has changed, so we need to re-index it
+            return file
+          }
+          return null
+        } catch (error) {
+          console.warn(`Skipping file ${file.path}:`, error.message)
+          return null
        }
-        const outOfDate = file.stat.mtime > fileChunks[0].mtime
-        if (outOfDate) {
-          // File has changed, so we need to re-index it
-          return file
-        }
-        return null
      }),
    ).then((files) => files.filter(Boolean))
diff --git a/src/database/modules/vector/vector-repository.ts b/src/database/modules/vector/vector-repository.ts
index c76b499..5068f3a 100644
--- a/src/database/modules/vector/vector-repository.ts
+++ b/src/database/modules/vector/vector-repository.ts
@@ -102,7 +102,7 @@
    const params = data.flatMap(vector => [
      vector.path,
      vector.mtime,
-      vector.content,
+      vector.content.replace(/\0/g, ''), // strip null bytes
      `[${vector.embedding.join(',')}]`, // convert to PostgreSQL vector format
      vector.metadata
    ])
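
A note on the recurring cleanup above: PostgreSQL text columns reject NUL (0x00) characters with "invalid byte sequence for encoding UTF8: 0x00", which is the failure this patch guards against. Since the same `.replace(/\0/g, '')` now appears at every read site, a follow-up could centralize it. A minimal sketch — the `stripNullBytes` helper name is hypothetical and not part of this diff:

// Hypothetical shared helper (not in this diff): one home for the null-byte
// cleanup instead of repeating the regex at each call site.
export function stripNullBytes(text: string): string {
  // PostgreSQL rejects NUL (0x00) in text values, so drop it entirely
  return text.replace(/\0/g, '')
}

// Possible usage at a read site:
// const fileContent = stripNullBytes(await this.app.vault.cachedRead(file))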
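The memory fix in both indexing paths has the same shape: instead of accumulating every embedded chunk in one `embeddingChunks` array and bulk-inserting at the end, each small batch is embedded, inserted, and released before the next begins, so peak memory is bounded by one batch rather than the whole vault. A generic sketch of that pattern, with illustrative names that are not from the codebase:

// Illustrative sketch of the streaming embed-then-insert pattern used above.
// Only one batch of embeddings is alive at a time.
async function embedAndInsertInBatches<T, R>(
  items: T[],
  batchSize: number,
  embedBatch: (batch: T[]) => Promise<R[]>,
  insertBatch: (embedded: R[]) => Promise<void>,
): Promise<void> {
  for (let i = 0; i < items.length; i += batchSize) {
    const batch = items.slice(i, i + batchSize)
    const embedded = await embedBatch(batch) // embeddings for this batch only
    await insertBatch(embedded)              // persist immediately
    // `embedded` becomes unreachable here, so the GC can reclaim it
  }
}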
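One caveat on `forceGarbageCollection`: V8 only exposes a `gc()` hook when it is started with `--expose-gc`, a flag Electron apps do not normally pass, so the helper will usually be a silent no-op and the real relief comes from the streaming inserts. A small sketch of the availability check, under that assumption:

// Sketch: detect whether a forced-GC hook actually exists in this runtime.
// `gc` is only defined when V8 runs with --expose-gc; otherwise both probes
// come back undefined and forceGarbageCollection() silently does nothing.
declare const global: any // Node/Electron global object, absent in plain browsers

const gcHook: (() => void) | undefined =
  typeof global !== 'undefined' && typeof global.gc === 'function'
    ? global.gc
    : typeof window !== 'undefined' && typeof (window as any).gc === 'function'
      ? (window as any).gc
      : undefined

console.log(gcHook ? 'forced GC hook available' : 'no --expose-gc; relying on normal GC')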