From 5c24993ab9ae11588a0c8fe6382d62c4943700d7 Mon Sep 17 00:00:00 2001
From: duanfuxiang
Date: Sat, 14 Jun 2025 13:21:29 +0800
Subject: [PATCH] update vector query time, remove delete file process

---
 src/core/rag/rag-engine.ts                    |  9 ++++++---
 src/database/modules/vector/vector-manager.ts | 17 ++++++++++++-----
 src/database/sql.ts                           | 16 ++++++++++++++++
 src/utils/prompt-generator.ts                 | 17 +++++++++--------
 4 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/src/core/rag/rag-engine.ts b/src/core/rag/rag-engine.ts
index df602dd..be6d0ef 100644
--- a/src/core/rag/rag-engine.ts
+++ b/src/core/rag/rag-engine.ts
@@ -53,6 +53,7 @@ export class RAGEngine {
       throw new Error('Embedding model is not set')
     }
     await this.initializeDimension()
+    console.log("updateVaultIndex")

     await this.vectorManager.updateVaultIndex(
       this.embeddingModel,
@@ -69,6 +70,7 @@
         })
       },
     )
+    console.log("updateVaultIndex done")
     this.initialized = true
   }

@@ -121,9 +123,10 @@
     await this.initializeDimension()

-    if (!this.initialized) {
-      await this.updateVaultIndex({ reindexAll: false }, onQueryProgressChange)
-    }
+    // if (!this.initialized) {
+    //   console.log("need to updateVaultIndex")
+    //   await this.updateVaultIndex({ reindexAll: false }, onQueryProgressChange)
+    // }

     const queryEmbedding = await this.getEmbedding(query)
     onQueryProgressChange?.({
       type: 'querying',
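Note on the rag-engine.ts hunks above (illustration only, not part of the diff): with the lazy update commented out of the query path (@@ -121), a query no longer calls updateVaultIndex on demand, so the vault index has to be kept fresh by an explicit call elsewhere. A minimal sketch of what that could look like, assuming the RAGEngine instance is reachable from the caller and that updateVaultIndex's progress callback is optional; the helper name, import path, and refresh interval are made up for illustration:

    import { RAGEngine } from 'src/core/rag/rag-engine' // repo-relative path; the real import path may differ

    // Hypothetical scheduling helper: run one incremental index pass up front,
    // then repeat on a timer, so the query path only pays for the embedding
    // lookup and the vector search.
    export async function keepVaultIndexFresh(
      ragEngine: RAGEngine,
      intervalMs: number = 10 * 60 * 1000, // assumed 10-minute refresh period
    ): Promise<void> {
      // reindexAll: false re-embeds only new or changed files and, via
      // cleanVectorsForDeletedFiles, drops vectors for files removed from the vault.
      await ragEngine.updateVaultIndex({ reindexAll: false })
      setInterval(() => {
        void ragEngine.updateVaultIndex({ reindexAll: false })
      }, intervalMs)
    }

The same call could equally be wired to vault change events; the timer is just the simplest way to show the contract the patch now relies on.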
diff --git a/src/database/modules/vector/vector-manager.ts b/src/database/modules/vector/vector-manager.ts
index eb4511e..188d176 100644
--- a/src/database/modules/vector/vector-manager.ts
+++ b/src/database/modules/vector/vector-manager.ts
@@ -85,6 +85,7 @@ export class VectorManager {
     },
     updateProgress?: (indexProgress: IndexProgress) => void,
   ): Promise<void> {
+    console.log("updateVaultIndex start")
     let filesToIndex: TFile[]
     if (options.reindexAll) {
       filesToIndex = await this.getFilesToIndex({
@@ -95,6 +96,7 @@
       })
       await this.repository.clearAllVectors(embeddingModel)
     } else {
+      console.log("updateVaultIndex cleanVectorsForDeletedFiles")
      await this.cleanVectorsForDeletedFiles(embeddingModel)
      filesToIndex = await this.getFilesToIndex({
        embeddingModel: embeddingModel,
@@ -168,13 +170,13 @@
     const embeddingProgress = { completed: 0 }

     // Reduce the batch size to lower memory pressure
-    const insertBatchSize = 16 // lowered from 64 to 16
+    const insertBatchSize = 32
     let batchCount = 0

     try {
       if (embeddingModel.supportsBatch) {
         // Providers with batch support: use the streaming logic
-        const embeddingBatchSize = 16 // lowered from 64 to 16
+        const embeddingBatchSize = 32

         for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
           batchCount++
@@ -226,7 +228,7 @@
         }
       } else {
         // Providers without batch support: use the streaming logic
-        const limit = pLimit(10) // lowered from 50 to 10 to ease concurrency pressure
+        const limit = pLimit(32) // raised from 10 to 32
         const abortController = new AbortController()

         // Streaming: process in batches and insert each batch immediately
@@ -482,7 +484,9 @@
   private async cleanVectorsForDeletedFiles(
     embeddingModel: EmbeddingModel,
   ) {
+    console.log("cleanVectorsForDeletedFiles start")
     const indexedFilePaths = await this.repository.getAllIndexedFilePaths(embeddingModel)
+    console.log("indexedFilePaths: ", indexedFilePaths)
     const needToDelete = indexedFilePaths.filter(filePath => !this.app.vault.getAbstractFileByPath(filePath))
     if (needToDelete.length > 0) {
       await this.repository.deleteVectorsForMultipleFiles(
@@ -490,6 +494,7 @@
         embeddingModel,
       )
     }
+    console.log("cleanVectorsForDeletedFiles done")
   }

   private async getFilesToIndex({
@@ -502,7 +507,8 @@ export class VectorManager {
     excludePatterns: string[]
     includePatterns: string[]
     reindexAll?: boolean
-  }): Promise<TFile[]> {
+  }): Promise<TFile[]> {
+    console.log("getFilesToIndex")
     let filesToIndex = this.app.vault.getMarkdownFiles()

     filesToIndex = filesToIndex.filter((file) => {
@@ -518,7 +524,7 @@ export class VectorManager {
     if (reindexAll) {
       return filesToIndex
     }
-
+    console.log("filesToIndex: ", filesToIndex)
     // Check for updated or new files
     filesToIndex = await Promise.all(
       filesToIndex.map(async (file) => {
@@ -541,6 +547,7 @@ export class VectorManager {
         const outOfDate = file.stat.mtime > fileChunks[0].mtime
         if (outOfDate) {
           // File has changed, so we need to re-index it
+          console.log("File has changed, so we need to re-index it", file.path)
           return file
         }
         return null
diff --git a/src/database/sql.ts b/src/database/sql.ts
index cc3fb58..8f61434 100644
--- a/src/database/sql.ts
+++ b/src/database/sql.ts
@@ -76,6 +76,22 @@ export const migrations: Record = {
       CREATE INDEX IF NOT EXISTS "embeddingIndex_384"
       ON "embeddings_384" USING hnsw ("embedding" vector_cosine_ops);
+
+      -- Create B-tree indexes on the path column to optimize file path queries
+      CREATE INDEX IF NOT EXISTS "pathIndex_1536"
+      ON "embeddings_1536" ("path");
+
+      CREATE INDEX IF NOT EXISTS "pathIndex_1024"
+      ON "embeddings_1024" ("path");
+
+      CREATE INDEX IF NOT EXISTS "pathIndex_768"
+      ON "embeddings_768" ("path");
+
+      CREATE INDEX IF NOT EXISTS "pathIndex_512"
+      ON "embeddings_512" ("path");
+
+      CREATE INDEX IF NOT EXISTS "pathIndex_384"
+      ON "embeddings_384" ("path");
     `
   },
   template: {
diff --git a/src/utils/prompt-generator.ts b/src/utils/prompt-generator.ts
index fa9aab6..1d5e33d 100644
--- a/src/utils/prompt-generator.ts
+++ b/src/utils/prompt-generator.ts
@@ -465,17 +465,17 @@ export class PromptGenerator {
           this.app
         )

-        // Create a Markdown file for the folder contents
-        const markdownFilePath = await this.createMarkdownFileForContent(
-          `${folder.path}/folder-contents`,
-          content,
-          false
-        )
+        // // Create a Markdown file for the folder contents
+        // const markdownFilePath = await this.createMarkdownFileForContent(
+        //   `${folder.path}/folder-contents`,
+        //   content,
+        //   false
+        // )

         completedFolders++
         folderContents.push(`\n${content}\n`)
-        folderContentsForProgress.push({ path: markdownFilePath, content })
-        allFileReadResults.push({ path: markdownFilePath, content })
+        folderContentsForProgress.push({ path: folder.path, content })
+        allFileReadResults.push({ path: folder.path, content })
       }

       // Folder reading finished (in case there were no files to read)
@@ -665,6 +665,7 @@ export class PromptGenerator {
       }
     }
     if (isOverThreshold) {
+      console.log("isOverThreshold", isOverThreshold)
       fileContentsPrompts = files.map((file) => {
         return `\n(Content omitted due to token limit. Relevant sections will be provided by semantic search below.)\n`
       }).join('\n')
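Note on the sql.ts migration above (illustration only, not part of the diff): the new B-tree indexes on "path" are what keep path-scoped lookups and deletions cheap once a vault has many embedded chunks — for example the kind of statement the repository presumably issues from deleteVectorsForMultipleFiles when cleanVectorsForDeletedFiles finds stale paths. A minimal sketch of such a query against the 1536-dimension table, assuming a node-postgres style client; the client wiring and the repository's exact SQL are assumptions:

    import { Client } from 'pg' // assumed driver; the plugin's actual database layer may differ

    // Delete every embedding row whose path is in the given list. With
    // "pathIndex_1536" in place the planner can satisfy the WHERE clause with
    // an index scan on "path" rather than a sequential scan over all chunks.
    async function deleteEmbeddingsByPath(client: Client, paths: string[]): Promise<void> {
      await client.query(
        'DELETE FROM "embeddings_1536" WHERE "path" = ANY($1::text[])',
        [paths],
      )
    }

The HNSW indexes created earlier in the same migration cover similarity search; the path indexes complement them for bookkeeping queries keyed on file path.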