import { backOff } from 'exponential-backoff'; import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import { minimatch } from 'minimatch'; import { App, Notice, TFile } from 'obsidian'; import pLimit from 'p-limit'; import removeMarkdown from 'remove-markdown'; import { IndexProgress } from '../../../components/chat-view/QueryProgress'; import { LLMAPIKeyInvalidException, LLMAPIKeyNotSetException, LLMBaseUrlNotSetException, LLMRateLimitExceededException, } from '../../../core/llm/exception'; import { InsertVector, SelectVector } from '../../../database/schema'; import { EmbeddingModel } from '../../../types/embedding'; import { getFilesWithTag } from '../../../utils/glob-utils'; import { openSettingsModalWithError } from '../../../utils/open-settings-modal'; import { DBManager } from '../../database-manager'; import { Workspace } from '../../json/workspace/types'; import { VectorRepository } from './vector-repository'; export class VectorManager { private app: App private repository: VectorRepository private dbManager: DBManager constructor(app: App, dbManager: DBManager) { this.app = app this.dbManager = dbManager this.repository = new VectorRepository(app, dbManager.getPgClient()) } async performSimilaritySearch( queryVector: number[], embeddingModel: EmbeddingModel, options: { minSimilarity: number limit: number scope?: { files: string[] folders: string[] } }, ): Promise< (Omit & { similarity: number })[] > { return await this.repository.performSimilaritySearch( queryVector, embeddingModel, options, ) } async getWorkspaceStatistics( embeddingModel: EmbeddingModel, workspace?: Workspace ): Promise<{ totalFiles: number totalChunks: number }> { // 构建工作区范围 let scope: { files: string[], folders: string[] } | undefined if (workspace) { const folders: string[] = [] const files: string[] = [] // 处理工作区中的文件夹和标签 for (const item of workspace.content) { if (item.type === 'folder') { folders.push(item.content) } else if (item.type === 'tag') { // 获取标签对应的所有文件 const tagFiles = getFilesWithTag(item.content, this.app) files.push(...tagFiles) } } // 只有当有文件夹或文件时才设置 scope if (folders.length > 0 || files.length > 0) { scope = { files, folders } } } if (scope) { return await this.repository.getWorkspaceStatistics(embeddingModel, scope) } else { return await this.repository.getVaultStatistics(embeddingModel) } } async getVaultStatistics(embeddingModel: EmbeddingModel): Promise<{ totalFiles: number totalChunks: number }> { return await this.repository.getVaultStatistics(embeddingModel) } // 强制垃圾回收的辅助方法 private forceGarbageCollection() { try { if (typeof global !== 'undefined' && global.gc) { global.gc() } else if (typeof window !== 'undefined' && (window as any).gc) { ((window as any).gc as () => void)(); } } catch (e) { // 忽略垃圾回收错误 } } // 检查并清理内存的辅助方法 private async memoryCleanup(batchCount: number) { // 每10批次强制垃圾回收 if (batchCount % 10 === 0) { this.forceGarbageCollection() // 短暂延迟让内存清理完成 await new Promise(resolve => setTimeout(resolve, 100)) } } async updateVaultIndex( embeddingModel: EmbeddingModel, options: { chunkSize: number batchSize: number excludePatterns: string[] includePatterns: string[] reindexAll?: boolean }, updateProgress?: (indexProgress: IndexProgress) => void, ): Promise { let filesToIndex: TFile[] if (options.reindexAll) { console.log("updateVaultIndex reindexAll") filesToIndex = await this.getFilesToIndex({ embeddingModel: embeddingModel, excludePatterns: options.excludePatterns, includePatterns: options.includePatterns, reindexAll: true, }) await this.repository.clearAllVectors(embeddingModel) } else { console.log("updateVaultIndex for update files") await this.cleanVectorsForDeletedFiles(embeddingModel) console.log("updateVaultIndex cleanVectorsForDeletedFiles") filesToIndex = await this.getFilesToIndex({ embeddingModel: embeddingModel, excludePatterns: options.excludePatterns, includePatterns: options.includePatterns, }) console.log("get files to index: ", filesToIndex.length) await this.repository.deleteVectorsForMultipleFiles( filesToIndex.map((file) => file.path), embeddingModel, ) console.log("delete vectors for multiple files: ", filesToIndex.length) } console.log("get files to index: ", filesToIndex.length) if (filesToIndex.length === 0) { return } // Embed the files const overlap = Math.floor(options.chunkSize * 0.15) const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: options.chunkSize, chunkOverlap: overlap, separators: [ "\n\n", "\n", ".", ",", " ", "\u200b", // Zero-width space "\uff0c", // Fullwidth comma "\u3001", // Ideographic comma "\uff0e", // Fullwidth full stop "\u3002", // Ideographic full stop "", ], }); console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap) const skippedFiles: string[] = [] const contentChunks: InsertVector[] = ( await Promise.all( filesToIndex.map(async (file) => { try { let fileContent = await this.app.vault.cachedRead(file) // 清理null字节,防止PostgreSQL UTF8编码错误 fileContent = fileContent.replace(/\0/g, '') const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) return fileDocuments .map((chunk): InsertVector | null => { // 保存原始内容,不在此处调用 removeMarkdown const rawContent = chunk.pageContent.replace(/\0/g, '') if (!rawContent || rawContent.trim().length === 0) { return null } return { path: file.path, mtime: file.stat.mtime, content: rawContent, // 保存原始内容 embedding: [], metadata: { startLine: Number(chunk.metadata.loc.lines.from), endLine: Number(chunk.metadata.loc.lines.to), }, } }) .filter((chunk): chunk is InsertVector => chunk !== null) } catch (error) { console.warn(`跳过文件 ${file.path}:`, error.message) skippedFiles.push(file.path) return [] } }), ) ).flat() console.log("contentChunks: ", contentChunks.length) if (skippedFiles.length > 0) { console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles) new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`) } updateProgress?.({ completedChunks: 0, totalChunks: contentChunks.length, totalFiles: filesToIndex.length, }) const embeddingProgress = { completed: 0 } // 减少批量大小以降低内存压力 const batchSize = options.batchSize let batchCount = 0 try { if (embeddingModel.supportsBatch) { // 支持批量处理的提供商:使用流式处理逻辑 for (let i = 0; i < contentChunks.length; i += batchSize) { batchCount++ const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length)) const embeddedBatch: InsertVector[] = [] await backOff( async () => { // 在嵌入之前处理 markdown const cleanedBatchData = batchChunks.map(chunk => { const cleanContent = removeMarkdown(chunk.content) return { chunk, cleanContent } }).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0) if (cleanedBatchData.length === 0) { return } const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent) const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) // 合并embedding结果到chunk数据 for (let j = 0; j < cleanedBatchData.length; j++) { const { chunk, cleanContent } = cleanedBatchData[j] const embeddedChunk: InsertVector = { path: chunk.path, mtime: chunk.mtime, content: cleanContent, // 使用已经清理过的内容 embedding: batchEmbeddings[j], metadata: chunk.metadata, } embeddedBatch.push(embeddedChunk) } }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 timeMultiple: 1.5, jitter: 'full', }, ) // 立即插入当前批次,避免内存累积 if (embeddedBatch.length > 0) { await this.repository.insertVectors(embeddedBatch, embeddingModel) // 清理批次数据 embeddedBatch.length = 0 } embeddingProgress.completed += batchChunks.length updateProgress?.({ completedChunks: embeddingProgress.completed, totalChunks: contentChunks.length, totalFiles: filesToIndex.length, }) // 定期内存清理 await this.memoryCleanup(batchCount) } } else { // 不支持批量处理的提供商:使用流式处理逻辑 const limit = pLimit(32) // 从50降低到10,减少并发压力 const abortController = new AbortController() // 流式处理:分批处理并立即插入 for (let i = 0; i < contentChunks.length; i += batchSize) { if (abortController.signal.aborted) { throw new Error('Operation was aborted') } batchCount++ const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length)) const embeddedBatch: InsertVector[] = [] const tasks = batchChunks.map((chunk) => limit(async () => { if (abortController.signal.aborted) { throw new Error('Operation was aborted') } try { await backOff( async () => { // 在嵌入之前处理 markdown const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') // 跳过清理后为空的内容 if (!cleanContent || cleanContent.trim().length === 0) { return } const embedding = await embeddingModel.getEmbedding(cleanContent) const embeddedChunk = { path: chunk.path, mtime: chunk.mtime, content: cleanContent, // 使用清理后的内容 embedding, metadata: chunk.metadata, } embeddedBatch.push(embeddedChunk) }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 timeMultiple: 1.5, jitter: 'full', }, ) } catch (error) { abortController.abort() throw error } }), ) await Promise.all(tasks) // 立即插入当前批次 if (embeddedBatch.length > 0) { await this.repository.insertVectors(embeddedBatch, embeddingModel) // 清理批次数据 embeddedBatch.length = 0 } embeddingProgress.completed += batchChunks.length updateProgress?.({ completedChunks: embeddingProgress.completed, totalChunks: contentChunks.length, totalFiles: filesToIndex.length, }) // 定期内存清理 await this.memoryCleanup(batchCount) } } } catch (error) { if ( error instanceof LLMAPIKeyNotSetException || error instanceof LLMAPIKeyInvalidException || error instanceof LLMBaseUrlNotSetException ) { openSettingsModalWithError(this.app, error.message) } else if (error instanceof LLMRateLimitExceededException) { new Notice(error.message) } else { console.error('Error embedding chunks:', error) throw error } } finally { // 最终清理 this.forceGarbageCollection() } } async updateWorkspaceIndex( embeddingModel: EmbeddingModel, workspace: Workspace, options: { chunkSize: number batchSize: number excludePatterns: string[] includePatterns: string[] reindexAll?: boolean }, updateProgress?: (indexProgress: IndexProgress) => void, ): Promise { let filesToIndex: TFile[] if (options.reindexAll) { console.log("updateWorkspaceIndex reindexAll") filesToIndex = await this.getFilesToIndexInWorkspace({ embeddingModel: embeddingModel, workspace: workspace, excludePatterns: options.excludePatterns, includePatterns: options.includePatterns, reindexAll: true, }) // 只清理工作区相关的向量,而不是全部 const workspaceFilePaths = filesToIndex.map((file) => file.path) if (workspaceFilePaths.length > 0) { await this.repository.deleteVectorsForMultipleFiles(workspaceFilePaths, embeddingModel) } } else { console.log("updateWorkspaceIndex for update files") await this.cleanVectorsForDeletedFiles(embeddingModel) console.log("updateWorkspaceIndex cleanVectorsForDeletedFiles") filesToIndex = await this.getFilesToIndexInWorkspace({ embeddingModel: embeddingModel, workspace: workspace, excludePatterns: options.excludePatterns, includePatterns: options.includePatterns, }) console.log("get workspace files to index: ", filesToIndex.length) await this.repository.deleteVectorsForMultipleFiles( filesToIndex.map((file) => file.path), embeddingModel, ) console.log("delete vectors for workspace files: ", filesToIndex.length) } console.log("get workspace files to index: ", filesToIndex.length) if (filesToIndex.length === 0) { return } // Embed the files (使用与 updateVaultIndex 相同的逻辑) const overlap = Math.floor(options.chunkSize * 0.15) const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: options.chunkSize, chunkOverlap: overlap, separators: [ "\n\n", "\n", ".", ",", " ", "\u200b", // Zero-width space "\uff0c", // Fullwidth comma "\u3001", // Ideographic comma "\uff0e", // Fullwidth full stop "\u3002", // Ideographic full stop "", ], }); console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap) const skippedFiles: string[] = [] const contentChunks: InsertVector[] = ( await Promise.all( filesToIndex.map(async (file) => { try { let fileContent = await this.app.vault.cachedRead(file) // 清理null字节,防止PostgreSQL UTF8编码错误 fileContent = fileContent.replace(/\0/g, '') const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) return fileDocuments .map((chunk): InsertVector | null => { // 保存原始内容,不在此处调用 removeMarkdown const rawContent = chunk.pageContent.replace(/\0/g, '') if (!rawContent || rawContent.trim().length === 0) { return null } return { path: file.path, mtime: file.stat.mtime, content: rawContent, // 保存原始内容 embedding: [], metadata: { startLine: Number(chunk.metadata.loc.lines.from), endLine: Number(chunk.metadata.loc.lines.to), }, } }) .filter((chunk): chunk is InsertVector => chunk !== null) } catch (error) { console.warn(`跳过文件 ${file.path}:`, error.message) skippedFiles.push(file.path) return [] } }), ) ).flat() console.log("contentChunks: ", contentChunks.length) if (skippedFiles.length > 0) { console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles) new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`) } updateProgress?.({ completedChunks: 0, totalChunks: contentChunks.length, totalFiles: filesToIndex.length, }) const embeddingProgress = { completed: 0 } // 减少批量大小以降低内存压力 const batchSize = options.batchSize let batchCount = 0 try { if (embeddingModel.supportsBatch) { // 支持批量处理的提供商:使用流式处理逻辑 for (let i = 0; i < contentChunks.length; i += batchSize) { batchCount++ const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length)) const embeddedBatch: InsertVector[] = [] await backOff( async () => { // 在嵌入之前处理 markdown,只处理一次 const cleanedBatchData = batchChunks.map(chunk => { const cleanContent = removeMarkdown(chunk.content) return { chunk, cleanContent } }).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0) if (cleanedBatchData.length === 0) { return } const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent) const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) // 合并embedding结果到chunk数据 for (let j = 0; j < cleanedBatchData.length; j++) { const { chunk, cleanContent } = cleanedBatchData[j] const embeddedChunk: InsertVector = { path: chunk.path, mtime: chunk.mtime, content: cleanContent, // 使用已经清理过的内容 embedding: batchEmbeddings[j], metadata: chunk.metadata, } embeddedBatch.push(embeddedChunk) } }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 timeMultiple: 1.5, jitter: 'full', }, ) // 立即插入当前批次,避免内存累积 if (embeddedBatch.length > 0) { await this.repository.insertVectors(embeddedBatch, embeddingModel) // 清理批次数据 embeddedBatch.length = 0 } embeddingProgress.completed += batchChunks.length updateProgress?.({ completedChunks: embeddingProgress.completed, totalChunks: contentChunks.length, totalFiles: filesToIndex.length, }) // 定期内存清理 await this.memoryCleanup(batchCount) } } else { // 不支持批量处理的提供商:使用流式处理逻辑 const limit = pLimit(32) // 从50降低到10,减少并发压力 const abortController = new AbortController() // 流式处理:分批处理并立即插入 for (let i = 0; i < contentChunks.length; i += batchSize) { if (abortController.signal.aborted) { throw new Error('Operation was aborted') } batchCount++ const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length)) const embeddedBatch: InsertVector[] = [] const tasks = batchChunks.map((chunk) => limit(async () => { if (abortController.signal.aborted) { throw new Error('Operation was aborted') } try { await backOff( async () => { // 在嵌入之前处理 markdown const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') // 跳过清理后为空的内容 if (!cleanContent || cleanContent.trim().length === 0) { return } const embedding = await embeddingModel.getEmbedding(cleanContent) const embeddedChunk = { path: chunk.path, mtime: chunk.mtime, content: cleanContent, // 使用清理后的内容 embedding, metadata: chunk.metadata, } embeddedBatch.push(embeddedChunk) }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 timeMultiple: 1.5, jitter: 'full', }, ) } catch (error) { abortController.abort() throw error } }), ) await Promise.all(tasks) // 立即插入当前批次 if (embeddedBatch.length > 0) { await this.repository.insertVectors(embeddedBatch, embeddingModel) // 清理批次数据 embeddedBatch.length = 0 } embeddingProgress.completed += batchChunks.length updateProgress?.({ completedChunks: embeddingProgress.completed, totalChunks: contentChunks.length, totalFiles: filesToIndex.length, }) // 定期内存清理 await this.memoryCleanup(batchCount) } } } catch (error) { if ( error instanceof LLMAPIKeyNotSetException || error instanceof LLMAPIKeyInvalidException || error instanceof LLMBaseUrlNotSetException ) { openSettingsModalWithError(this.app, error.message) } else if (error instanceof LLMRateLimitExceededException) { new Notice(error.message) } else { console.error('Error embedding chunks:', error) throw error } } finally { // 最终清理 this.forceGarbageCollection() } } async UpdateFileVectorIndex( embeddingModel: EmbeddingModel, chunkSize: number, batchSize: number, file: TFile ) { try { // Delete existing vectors for the files await this.repository.deleteVectorsForSingleFile( file.path, embeddingModel, ) // Embed the files const overlap = Math.floor(chunkSize * 0.15) const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: chunkSize, chunkOverlap: overlap, separators: [ "\n\n", "\n", ".", ",", " ", "\u200b", // Zero-width space "\uff0c", // Fullwidth comma "\u3001", // Ideographic comma "\uff0e", // Fullwidth full stop "\u3002", // Ideographic full stop "", ], }); let fileContent = await this.app.vault.cachedRead(file) // 清理null字节,防止PostgreSQL UTF8编码错误 fileContent = fileContent.replace(/\0/g, '') const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) const contentChunks: InsertVector[] = fileDocuments .map((chunk): InsertVector | null => { // 保存原始内容,不在此处调用 removeMarkdown const rawContent = String(chunk.pageContent || '').replace(/\0/g, '') if (!rawContent || rawContent.trim().length === 0) { return null } return { path: file.path, mtime: file.stat.mtime, content: rawContent, // 保存原始内容 embedding: [], metadata: { startLine: Number(chunk.metadata.loc.lines.from), endLine: Number(chunk.metadata.loc.lines.to), }, } }) .filter((chunk): chunk is InsertVector => chunk !== null) let batchCount = 0 try { if (embeddingModel.supportsBatch) { // 支持批量处理的提供商:使用流式处理逻辑 for (let i = 0; i < contentChunks.length; i += batchSize) { batchCount++ console.log(`Embedding batch ${batchCount} of ${Math.ceil(contentChunks.length / batchSize)}`) const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length)) const embeddedBatch: InsertVector[] = [] await backOff( async () => { // 在嵌入之前处理 markdown,只处理一次 const cleanedBatchData = batchChunks.map(chunk => { const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') return { chunk, cleanContent } }).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0) if (cleanedBatchData.length === 0) { return } const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent) const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) // 合并embedding结果到chunk数据 for (let j = 0; j < cleanedBatchData.length; j++) { const { chunk, cleanContent } = cleanedBatchData[j] const embeddedChunk: InsertVector = { path: chunk.path, mtime: chunk.mtime, content: cleanContent, // 使用已经清理过的内容 embedding: batchEmbeddings[j], metadata: chunk.metadata, } embeddedBatch.push(embeddedChunk) } }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 timeMultiple: 1.5, jitter: 'full', }, ) // 立即插入当前批次 if (embeddedBatch.length > 0) { await this.repository.insertVectors(embeddedBatch, embeddingModel) // 清理批次数据 embeddedBatch.length = 0 } // 定期内存清理 await this.memoryCleanup(batchCount) } } else { // 不支持批量处理的提供商:使用流式处理逻辑 const limit = pLimit(10) // 从50降低到10 const abortController = new AbortController() // 流式处理:分批处理并立即插入 for (let i = 0; i < contentChunks.length; i += batchSize) { if (abortController.signal.aborted) { throw new Error('Operation was aborted') } batchCount++ const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length)) const embeddedBatch: InsertVector[] = [] const tasks = batchChunks.map((chunk) => limit(async () => { if (abortController.signal.aborted) { throw new Error('Operation was aborted') } try { await backOff( async () => { // 在嵌入之前处理 markdown const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') // 跳过清理后为空的内容 if (!cleanContent || cleanContent.trim().length === 0) { return } const embedding = await embeddingModel.getEmbedding(cleanContent) const embeddedChunk = { path: chunk.path, mtime: chunk.mtime, content: cleanContent, // 使用清理后的内容 embedding, metadata: chunk.metadata, } embeddedBatch.push(embeddedChunk) }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 timeMultiple: 1.5, jitter: 'full', }, ) } catch (error) { abortController.abort() throw error } }), ) await Promise.all(tasks) // 立即插入当前批次 if (embeddedBatch.length > 0) { await this.repository.insertVectors(embeddedBatch, embeddingModel) // 清理批次数据 embeddedBatch.length = 0 } // 定期内存清理 await this.memoryCleanup(batchCount) } } } catch (error) { console.error('Error embedding chunks:', error) } finally { // 最终清理 this.forceGarbageCollection() } } catch (error) { console.warn(`跳过文件 ${file.path}:`, error.message) new Notice(`跳过文件 ${file.name}: ${error.message}`) } } async DeleteFileVectorIndex( embeddingModel: EmbeddingModel, file: TFile ) { await this.repository.deleteVectorsForSingleFile(file.path, embeddingModel) } private async cleanVectorsForDeletedFiles( embeddingModel: EmbeddingModel, ) { const indexedFilePaths = await this.repository.getAllIndexedFilePaths(embeddingModel) const needToDelete = indexedFilePaths.filter(filePath => !this.app.vault.getAbstractFileByPath(filePath)) if (needToDelete.length > 0) { await this.repository.deleteVectorsForMultipleFiles( needToDelete, embeddingModel, ) } } private async getFilesToIndex({ embeddingModel, excludePatterns, includePatterns, reindexAll, }: { embeddingModel: EmbeddingModel excludePatterns: string[] includePatterns: string[] reindexAll?: boolean }): Promise { let filesToIndex = this.app.vault.getMarkdownFiles() console.log("get all vault files: ", filesToIndex.length) filesToIndex = filesToIndex.filter((file) => { return !excludePatterns.some((pattern) => minimatch(file.path, pattern)) }) if (includePatterns.length > 0) { filesToIndex = filesToIndex.filter((file) => { return includePatterns.some((pattern) => minimatch(file.path, pattern)) }) } if (reindexAll) { return filesToIndex } // 优化流程:使用数据库最大mtime来过滤需要更新的文件 try { const maxMtime = await this.repository.getMaxMtime(embeddingModel) console.log("Database max mtime:", maxMtime) if (maxMtime === null) { // 数据库中没有任何向量,需要索引所有文件 return filesToIndex } // 筛选出在数据库最后更新时间之后修改的文件 return filesToIndex.filter((file) => { return file.stat.mtime > maxMtime }) } catch (error) { console.error("Error getting max mtime from database:", error) return [] } } private async getFilesToIndexInWorkspace({ embeddingModel, workspace, excludePatterns, includePatterns, reindexAll, }: { embeddingModel: EmbeddingModel workspace: Workspace excludePatterns: string[] includePatterns: string[] reindexAll?: boolean }): Promise { // 获取工作区中的所有文件 const workspaceFiles = new Set() if (workspace) { // 处理工作区中的文件夹和标签 for (const item of workspace.content) { if (item.type === 'folder') { const folderPath = item.content // 获取文件夹下的所有文件 const files = this.app.vault.getMarkdownFiles().filter(file => file.path.startsWith(folderPath === '/' ? '' : folderPath + '/') ) // 添加所有文件路径 files.forEach(file => { workspaceFiles.add(file.path) }) } else if (item.type === 'tag') { // 获取标签对应的所有文件 const tagFiles = getFilesWithTag(item.content, this.app) tagFiles.forEach(filePath => { workspaceFiles.add(filePath) }) } } } // 将路径转换为 TFile 对象 let filesToIndex = Array.from(workspaceFiles) .map(path => this.app.vault.getFileByPath(path)) .filter((file): file is TFile => file !== null && file instanceof TFile) console.log("get workspace files: ", filesToIndex.length) // 应用排除和包含模式 filesToIndex = filesToIndex.filter((file) => { return !excludePatterns.some((pattern) => minimatch(file.path, pattern)) }) if (includePatterns.length > 0) { filesToIndex = filesToIndex.filter((file) => { return includePatterns.some((pattern) => minimatch(file.path, pattern)) }) } if (reindexAll) { return filesToIndex } // 优化流程:使用数据库最大mtime来过滤需要更新的文件 try { const maxMtime = await this.repository.getMaxMtime(embeddingModel) console.log("Database max mtime:", maxMtime) if (maxMtime === null) { // 数据库中没有任何向量,需要索引所有文件 return filesToIndex } // 筛选出在数据库最后更新时间之后修改的文件 return filesToIndex.filter((file) => { return file.stat.mtime > maxMtime }) } catch (error) { console.error("Error getting max mtime from database:", error) return [] } } }