mirror of
https://github.com/EthanMarti/infio-copilot.git
synced 2026-01-16 08:21:55 +00:00
1033 lines
30 KiB
TypeScript
1033 lines
30 KiB
TypeScript
import { backOff } from 'exponential-backoff';
|
||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||
import { minimatch } from 'minimatch';
|
||
import { App, Notice, TFile } from 'obsidian';
|
||
import pLimit from 'p-limit';
|
||
import removeMarkdown from 'remove-markdown';
|
||
|
||
import { IndexProgress } from '../../../components/chat-view/QueryProgress';
|
||
import {
|
||
LLMAPIKeyInvalidException,
|
||
LLMAPIKeyNotSetException,
|
||
LLMBaseUrlNotSetException,
|
||
LLMRateLimitExceededException,
|
||
} from '../../../core/llm/exception';
|
||
import { InsertVector, SelectVector } from '../../../database/schema';
|
||
import { EmbeddingModel } from '../../../types/embedding';
|
||
import { getFilesWithTag } from '../../../utils/glob-utils';
|
||
import { openSettingsModalWithError } from '../../../utils/open-settings-modal';
|
||
import { DBManager } from '../../database-manager';
|
||
import { Workspace } from '../../json/workspace/types';
|
||
|
||
import { VectorRepository } from './vector-repository';
|
||
|
||
export class VectorManager {
|
||
private app: App
|
||
private repository: VectorRepository
|
||
private dbManager: DBManager
|
||
|
||
constructor(app: App, dbManager: DBManager) {
|
||
this.app = app
|
||
this.dbManager = dbManager
|
||
this.repository = new VectorRepository(app, dbManager.getPgClient())
|
||
}
|
||
|
||
async performSimilaritySearch(
|
||
queryVector: number[],
|
||
embeddingModel: EmbeddingModel,
|
||
options: {
|
||
minSimilarity: number
|
||
limit: number
|
||
scope?: {
|
||
files: string[]
|
||
folders: string[]
|
||
}
|
||
},
|
||
): Promise<
|
||
(Omit<SelectVector, 'embedding'> & {
|
||
similarity: number
|
||
})[]
|
||
> {
|
||
return await this.repository.performSimilaritySearch(
|
||
queryVector,
|
||
embeddingModel,
|
||
options,
|
||
)
|
||
}
|
||
|
||
async getWorkspaceStatistics(
|
||
embeddingModel: EmbeddingModel,
|
||
workspace?: Workspace
|
||
): Promise<{
|
||
totalFiles: number
|
||
totalChunks: number
|
||
}> {
|
||
// 构建工作区范围
|
||
let scope: { files: string[], folders: string[] } | undefined
|
||
if (workspace) {
|
||
const folders: string[] = []
|
||
const files: string[] = []
|
||
|
||
// 处理工作区中的文件夹和标签
|
||
for (const item of workspace.content) {
|
||
if (item.type === 'folder') {
|
||
folders.push(item.content)
|
||
} else if (item.type === 'tag') {
|
||
// 获取标签对应的所有文件
|
||
const tagFiles = getFilesWithTag(item.content, this.app)
|
||
files.push(...tagFiles)
|
||
}
|
||
}
|
||
|
||
// 只有当有文件夹或文件时才设置 scope
|
||
if (folders.length > 0 || files.length > 0) {
|
||
scope = { files, folders }
|
||
}
|
||
}
|
||
|
||
if (scope) {
|
||
return await this.repository.getWorkspaceStatistics(embeddingModel, scope)
|
||
} else {
|
||
return await this.repository.getVaultStatistics(embeddingModel)
|
||
}
|
||
}
|
||
|
||
async getVaultStatistics(embeddingModel: EmbeddingModel): Promise<{
|
||
totalFiles: number
|
||
totalChunks: number
|
||
}> {
|
||
return await this.repository.getVaultStatistics(embeddingModel)
|
||
}
|
||
|
||
// 强制垃圾回收的辅助方法
|
||
private forceGarbageCollection() {
|
||
try {
|
||
if (typeof global !== 'undefined' && global.gc) {
|
||
global.gc()
|
||
} else if (typeof window !== 'undefined' && (window as any).gc) {
|
||
((window as any).gc as () => void)();
|
||
}
|
||
} catch (e) {
|
||
// 忽略垃圾回收错误
|
||
}
|
||
}
|
||
|
||
// 检查并清理内存的辅助方法
|
||
private async memoryCleanup(batchCount: number) {
|
||
// 每10批次强制垃圾回收
|
||
if (batchCount % 10 === 0) {
|
||
this.forceGarbageCollection()
|
||
// 短暂延迟让内存清理完成
|
||
await new Promise(resolve => setTimeout(resolve, 100))
|
||
}
|
||
}
|
||
|
||
async updateVaultIndex(
|
||
embeddingModel: EmbeddingModel,
|
||
options: {
|
||
chunkSize: number
|
||
batchSize: number
|
||
excludePatterns: string[]
|
||
includePatterns: string[]
|
||
reindexAll?: boolean
|
||
},
|
||
updateProgress?: (indexProgress: IndexProgress) => void,
|
||
): Promise<void> {
|
||
let filesToIndex: TFile[]
|
||
if (options.reindexAll) {
|
||
console.log("updateVaultIndex reindexAll")
|
||
filesToIndex = await this.getFilesToIndex({
|
||
embeddingModel: embeddingModel,
|
||
excludePatterns: options.excludePatterns,
|
||
includePatterns: options.includePatterns,
|
||
reindexAll: true,
|
||
})
|
||
await this.repository.clearAllVectors(embeddingModel)
|
||
} else {
|
||
console.log("updateVaultIndex for update files")
|
||
await this.cleanVectorsForDeletedFiles(embeddingModel)
|
||
console.log("updateVaultIndex cleanVectorsForDeletedFiles")
|
||
filesToIndex = await this.getFilesToIndex({
|
||
embeddingModel: embeddingModel,
|
||
excludePatterns: options.excludePatterns,
|
||
includePatterns: options.includePatterns,
|
||
})
|
||
console.log("get files to index: ", filesToIndex.length)
|
||
await this.repository.deleteVectorsForMultipleFiles(
|
||
filesToIndex.map((file) => file.path),
|
||
embeddingModel,
|
||
)
|
||
console.log("delete vectors for multiple files: ", filesToIndex.length)
|
||
}
|
||
console.log("get files to index: ", filesToIndex.length)
|
||
|
||
if (filesToIndex.length === 0) {
|
||
return
|
||
}
|
||
|
||
// Embed the files
|
||
const overlap = Math.floor(options.chunkSize * 0.15)
|
||
const textSplitter = new RecursiveCharacterTextSplitter({
|
||
chunkSize: options.chunkSize,
|
||
chunkOverlap: overlap,
|
||
separators: [
|
||
"\n\n",
|
||
"\n",
|
||
".",
|
||
",",
|
||
" ",
|
||
"\u200b", // Zero-width space
|
||
"\uff0c", // Fullwidth comma
|
||
"\u3001", // Ideographic comma
|
||
"\uff0e", // Fullwidth full stop
|
||
"\u3002", // Ideographic full stop
|
||
"",
|
||
],
|
||
});
|
||
console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)
|
||
|
||
const skippedFiles: string[] = []
|
||
const contentChunks: InsertVector[] = (
|
||
await Promise.all(
|
||
filesToIndex.map(async (file) => {
|
||
try {
|
||
let fileContent = await this.app.vault.cachedRead(file)
|
||
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||
fileContent = fileContent.replace(/\0/g, '')
|
||
const fileDocuments = await textSplitter.createDocuments([
|
||
fileContent,
|
||
])
|
||
return fileDocuments
|
||
.map((chunk): InsertVector | null => {
|
||
// 保存原始内容,不在此处调用 removeMarkdown
|
||
const rawContent = chunk.pageContent.replace(/\0/g, '')
|
||
if (!rawContent || rawContent.trim().length === 0) {
|
||
return null
|
||
}
|
||
return {
|
||
path: file.path,
|
||
mtime: file.stat.mtime,
|
||
content: rawContent, // 保存原始内容
|
||
embedding: [],
|
||
metadata: {
|
||
startLine: Number(chunk.metadata.loc.lines.from),
|
||
endLine: Number(chunk.metadata.loc.lines.to),
|
||
},
|
||
}
|
||
})
|
||
.filter((chunk): chunk is InsertVector => chunk !== null)
|
||
} catch (error) {
|
||
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||
skippedFiles.push(file.path)
|
||
return []
|
||
}
|
||
}),
|
||
)
|
||
).flat()
|
||
|
||
console.log("contentChunks: ", contentChunks.length)
|
||
|
||
if (skippedFiles.length > 0) {
|
||
console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
|
||
new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
|
||
}
|
||
|
||
updateProgress?.({
|
||
completedChunks: 0,
|
||
totalChunks: contentChunks.length,
|
||
totalFiles: filesToIndex.length,
|
||
})
|
||
|
||
const embeddingProgress = { completed: 0 }
|
||
// 减少批量大小以降低内存压力
|
||
const batchSize = options.batchSize
|
||
let batchCount = 0
|
||
|
||
try {
|
||
if (embeddingModel.supportsBatch) {
|
||
// 支持批量处理的提供商:使用流式处理逻辑
|
||
for (let i = 0; i < contentChunks.length; i += batchSize) {
|
||
batchCount++
|
||
const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
|
||
|
||
const embeddedBatch: InsertVector[] = []
|
||
|
||
await backOff(
|
||
async () => {
|
||
// 在嵌入之前处理 markdown
|
||
const cleanedBatchData = batchChunks.map(chunk => {
|
||
const cleanContent = removeMarkdown(chunk.content)
|
||
return { chunk, cleanContent }
|
||
}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
|
||
|
||
if (cleanedBatchData.length === 0) {
|
||
return
|
||
}
|
||
|
||
const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
|
||
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
||
|
||
// 合并embedding结果到chunk数据
|
||
for (let j = 0; j < cleanedBatchData.length; j++) {
|
||
const { chunk, cleanContent } = cleanedBatchData[j]
|
||
const embeddedChunk: InsertVector = {
|
||
path: chunk.path,
|
||
mtime: chunk.mtime,
|
||
content: cleanContent, // 使用已经清理过的内容
|
||
embedding: batchEmbeddings[j],
|
||
metadata: chunk.metadata,
|
||
}
|
||
embeddedBatch.push(embeddedChunk)
|
||
}
|
||
},
|
||
{
|
||
numOfAttempts: 3, // 减少重试次数
|
||
startingDelay: 500, // 减少延迟
|
||
timeMultiple: 1.5,
|
||
jitter: 'full',
|
||
},
|
||
)
|
||
|
||
// 立即插入当前批次,避免内存累积
|
||
if (embeddedBatch.length > 0) {
|
||
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||
// 清理批次数据
|
||
embeddedBatch.length = 0
|
||
}
|
||
|
||
embeddingProgress.completed += batchChunks.length
|
||
updateProgress?.({
|
||
completedChunks: embeddingProgress.completed,
|
||
totalChunks: contentChunks.length,
|
||
totalFiles: filesToIndex.length,
|
||
})
|
||
|
||
// 定期内存清理
|
||
await this.memoryCleanup(batchCount)
|
||
}
|
||
} else {
|
||
// 不支持批量处理的提供商:使用流式处理逻辑
|
||
const limit = pLimit(32) // 从50降低到10,减少并发压力
|
||
const abortController = new AbortController()
|
||
|
||
// 流式处理:分批处理并立即插入
|
||
for (let i = 0; i < contentChunks.length; i += batchSize) {
|
||
if (abortController.signal.aborted) {
|
||
throw new Error('Operation was aborted')
|
||
}
|
||
|
||
batchCount++
|
||
const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
|
||
const embeddedBatch: InsertVector[] = []
|
||
|
||
const tasks = batchChunks.map((chunk) =>
|
||
limit(async () => {
|
||
if (abortController.signal.aborted) {
|
||
throw new Error('Operation was aborted')
|
||
}
|
||
try {
|
||
await backOff(
|
||
async () => {
|
||
// 在嵌入之前处理 markdown
|
||
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
|
||
// 跳过清理后为空的内容
|
||
if (!cleanContent || cleanContent.trim().length === 0) {
|
||
return
|
||
}
|
||
|
||
const embedding = await embeddingModel.getEmbedding(cleanContent)
|
||
const embeddedChunk = {
|
||
path: chunk.path,
|
||
mtime: chunk.mtime,
|
||
content: cleanContent, // 使用清理后的内容
|
||
embedding,
|
||
metadata: chunk.metadata,
|
||
}
|
||
embeddedBatch.push(embeddedChunk)
|
||
},
|
||
{
|
||
numOfAttempts: 3, // 减少重试次数
|
||
startingDelay: 500, // 减少延迟
|
||
timeMultiple: 1.5,
|
||
jitter: 'full',
|
||
},
|
||
)
|
||
} catch (error) {
|
||
abortController.abort()
|
||
throw error
|
||
}
|
||
}),
|
||
)
|
||
|
||
await Promise.all(tasks)
|
||
|
||
// 立即插入当前批次
|
||
if (embeddedBatch.length > 0) {
|
||
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||
// 清理批次数据
|
||
embeddedBatch.length = 0
|
||
}
|
||
|
||
embeddingProgress.completed += batchChunks.length
|
||
updateProgress?.({
|
||
completedChunks: embeddingProgress.completed,
|
||
totalChunks: contentChunks.length,
|
||
totalFiles: filesToIndex.length,
|
||
})
|
||
|
||
// 定期内存清理
|
||
await this.memoryCleanup(batchCount)
|
||
}
|
||
}
|
||
} catch (error) {
|
||
if (
|
||
error instanceof LLMAPIKeyNotSetException ||
|
||
error instanceof LLMAPIKeyInvalidException ||
|
||
error instanceof LLMBaseUrlNotSetException
|
||
) {
|
||
openSettingsModalWithError(this.app, error.message)
|
||
} else if (error instanceof LLMRateLimitExceededException) {
|
||
new Notice(error.message)
|
||
} else {
|
||
console.error('Error embedding chunks:', error)
|
||
throw error
|
||
}
|
||
} finally {
|
||
// 最终清理
|
||
this.forceGarbageCollection()
|
||
}
|
||
}
|
||
|
||
async updateWorkspaceIndex(
|
||
embeddingModel: EmbeddingModel,
|
||
workspace: Workspace,
|
||
options: {
|
||
chunkSize: number
|
||
batchSize: number
|
||
excludePatterns: string[]
|
||
includePatterns: string[]
|
||
reindexAll?: boolean
|
||
},
|
||
updateProgress?: (indexProgress: IndexProgress) => void,
|
||
): Promise<void> {
|
||
let filesToIndex: TFile[]
|
||
if (options.reindexAll) {
|
||
console.log("updateWorkspaceIndex reindexAll")
|
||
filesToIndex = await this.getFilesToIndexInWorkspace({
|
||
embeddingModel: embeddingModel,
|
||
workspace: workspace,
|
||
excludePatterns: options.excludePatterns,
|
||
includePatterns: options.includePatterns,
|
||
reindexAll: true,
|
||
})
|
||
// 只清理工作区相关的向量,而不是全部
|
||
const workspaceFilePaths = filesToIndex.map((file) => file.path)
|
||
if (workspaceFilePaths.length > 0) {
|
||
await this.repository.deleteVectorsForMultipleFiles(workspaceFilePaths, embeddingModel)
|
||
}
|
||
} else {
|
||
console.log("updateWorkspaceIndex for update files")
|
||
await this.cleanVectorsForDeletedFiles(embeddingModel)
|
||
console.log("updateWorkspaceIndex cleanVectorsForDeletedFiles")
|
||
filesToIndex = await this.getFilesToIndexInWorkspace({
|
||
embeddingModel: embeddingModel,
|
||
workspace: workspace,
|
||
excludePatterns: options.excludePatterns,
|
||
includePatterns: options.includePatterns,
|
||
})
|
||
console.log("get workspace files to index: ", filesToIndex.length)
|
||
await this.repository.deleteVectorsForMultipleFiles(
|
||
filesToIndex.map((file) => file.path),
|
||
embeddingModel,
|
||
)
|
||
console.log("delete vectors for workspace files: ", filesToIndex.length)
|
||
}
|
||
console.log("get workspace files to index: ", filesToIndex.length)
|
||
|
||
if (filesToIndex.length === 0) {
|
||
return
|
||
}
|
||
|
||
// Embed the files (使用与 updateVaultIndex 相同的逻辑)
|
||
const overlap = Math.floor(options.chunkSize * 0.15)
|
||
const textSplitter = new RecursiveCharacterTextSplitter({
|
||
chunkSize: options.chunkSize,
|
||
chunkOverlap: overlap,
|
||
separators: [
|
||
"\n\n",
|
||
"\n",
|
||
".",
|
||
",",
|
||
" ",
|
||
"\u200b", // Zero-width space
|
||
"\uff0c", // Fullwidth comma
|
||
"\u3001", // Ideographic comma
|
||
"\uff0e", // Fullwidth full stop
|
||
"\u3002", // Ideographic full stop
|
||
"",
|
||
],
|
||
});
|
||
console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)
|
||
|
||
const skippedFiles: string[] = []
|
||
const contentChunks: InsertVector[] = (
|
||
await Promise.all(
|
||
filesToIndex.map(async (file) => {
|
||
try {
|
||
let fileContent = await this.app.vault.cachedRead(file)
|
||
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||
fileContent = fileContent.replace(/\0/g, '')
|
||
const fileDocuments = await textSplitter.createDocuments([
|
||
fileContent,
|
||
])
|
||
return fileDocuments
|
||
.map((chunk): InsertVector | null => {
|
||
// 保存原始内容,不在此处调用 removeMarkdown
|
||
const rawContent = chunk.pageContent.replace(/\0/g, '')
|
||
if (!rawContent || rawContent.trim().length === 0) {
|
||
return null
|
||
}
|
||
return {
|
||
path: file.path,
|
||
mtime: file.stat.mtime,
|
||
content: rawContent, // 保存原始内容
|
||
embedding: [],
|
||
metadata: {
|
||
startLine: Number(chunk.metadata.loc.lines.from),
|
||
endLine: Number(chunk.metadata.loc.lines.to),
|
||
},
|
||
}
|
||
})
|
||
.filter((chunk): chunk is InsertVector => chunk !== null)
|
||
} catch (error) {
|
||
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||
skippedFiles.push(file.path)
|
||
return []
|
||
}
|
||
}),
|
||
)
|
||
).flat()
|
||
|
||
console.log("contentChunks: ", contentChunks.length)
|
||
|
||
if (skippedFiles.length > 0) {
|
||
console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
|
||
new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
|
||
}
|
||
|
||
updateProgress?.({
|
||
completedChunks: 0,
|
||
totalChunks: contentChunks.length,
|
||
totalFiles: filesToIndex.length,
|
||
})
|
||
|
||
const embeddingProgress = { completed: 0 }
|
||
// 减少批量大小以降低内存压力
|
||
const batchSize = options.batchSize
|
||
let batchCount = 0
|
||
|
||
try {
|
||
if (embeddingModel.supportsBatch) {
|
||
// 支持批量处理的提供商:使用流式处理逻辑
|
||
for (let i = 0; i < contentChunks.length; i += batchSize) {
|
||
batchCount++
|
||
const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
|
||
|
||
const embeddedBatch: InsertVector[] = []
|
||
|
||
await backOff(
|
||
async () => {
|
||
// 在嵌入之前处理 markdown,只处理一次
|
||
const cleanedBatchData = batchChunks.map(chunk => {
|
||
const cleanContent = removeMarkdown(chunk.content)
|
||
return { chunk, cleanContent }
|
||
}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
|
||
|
||
if (cleanedBatchData.length === 0) {
|
||
return
|
||
}
|
||
|
||
const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
|
||
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
||
|
||
// 合并embedding结果到chunk数据
|
||
for (let j = 0; j < cleanedBatchData.length; j++) {
|
||
const { chunk, cleanContent } = cleanedBatchData[j]
|
||
const embeddedChunk: InsertVector = {
|
||
path: chunk.path,
|
||
mtime: chunk.mtime,
|
||
content: cleanContent, // 使用已经清理过的内容
|
||
embedding: batchEmbeddings[j],
|
||
metadata: chunk.metadata,
|
||
}
|
||
embeddedBatch.push(embeddedChunk)
|
||
}
|
||
},
|
||
{
|
||
numOfAttempts: 3, // 减少重试次数
|
||
startingDelay: 500, // 减少延迟
|
||
timeMultiple: 1.5,
|
||
jitter: 'full',
|
||
},
|
||
)
|
||
|
||
// 立即插入当前批次,避免内存累积
|
||
if (embeddedBatch.length > 0) {
|
||
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||
// 清理批次数据
|
||
embeddedBatch.length = 0
|
||
}
|
||
|
||
embeddingProgress.completed += batchChunks.length
|
||
updateProgress?.({
|
||
completedChunks: embeddingProgress.completed,
|
||
totalChunks: contentChunks.length,
|
||
totalFiles: filesToIndex.length,
|
||
})
|
||
|
||
// 定期内存清理
|
||
await this.memoryCleanup(batchCount)
|
||
}
|
||
} else {
|
||
// 不支持批量处理的提供商:使用流式处理逻辑
|
||
const limit = pLimit(32) // 从50降低到10,减少并发压力
|
||
const abortController = new AbortController()
|
||
|
||
// 流式处理:分批处理并立即插入
|
||
for (let i = 0; i < contentChunks.length; i += batchSize) {
|
||
if (abortController.signal.aborted) {
|
||
throw new Error('Operation was aborted')
|
||
}
|
||
|
||
batchCount++
|
||
const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
|
||
const embeddedBatch: InsertVector[] = []
|
||
|
||
const tasks = batchChunks.map((chunk) =>
|
||
limit(async () => {
|
||
if (abortController.signal.aborted) {
|
||
throw new Error('Operation was aborted')
|
||
}
|
||
try {
|
||
await backOff(
|
||
async () => {
|
||
// 在嵌入之前处理 markdown
|
||
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
|
||
// 跳过清理后为空的内容
|
||
if (!cleanContent || cleanContent.trim().length === 0) {
|
||
return
|
||
}
|
||
|
||
const embedding = await embeddingModel.getEmbedding(cleanContent)
|
||
const embeddedChunk = {
|
||
path: chunk.path,
|
||
mtime: chunk.mtime,
|
||
content: cleanContent, // 使用清理后的内容
|
||
embedding,
|
||
metadata: chunk.metadata,
|
||
}
|
||
embeddedBatch.push(embeddedChunk)
|
||
},
|
||
{
|
||
numOfAttempts: 3, // 减少重试次数
|
||
startingDelay: 500, // 减少延迟
|
||
timeMultiple: 1.5,
|
||
jitter: 'full',
|
||
},
|
||
)
|
||
} catch (error) {
|
||
abortController.abort()
|
||
throw error
|
||
}
|
||
}),
|
||
)
|
||
|
||
await Promise.all(tasks)
|
||
|
||
// 立即插入当前批次
|
||
if (embeddedBatch.length > 0) {
|
||
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||
// 清理批次数据
|
||
embeddedBatch.length = 0
|
||
}
|
||
|
||
embeddingProgress.completed += batchChunks.length
|
||
updateProgress?.({
|
||
completedChunks: embeddingProgress.completed,
|
||
totalChunks: contentChunks.length,
|
||
totalFiles: filesToIndex.length,
|
||
})
|
||
|
||
// 定期内存清理
|
||
await this.memoryCleanup(batchCount)
|
||
}
|
||
}
|
||
} catch (error) {
|
||
if (
|
||
error instanceof LLMAPIKeyNotSetException ||
|
||
error instanceof LLMAPIKeyInvalidException ||
|
||
error instanceof LLMBaseUrlNotSetException
|
||
) {
|
||
openSettingsModalWithError(this.app, error.message)
|
||
} else if (error instanceof LLMRateLimitExceededException) {
|
||
new Notice(error.message)
|
||
} else {
|
||
console.error('Error embedding chunks:', error)
|
||
throw error
|
||
}
|
||
} finally {
|
||
// 最终清理
|
||
this.forceGarbageCollection()
|
||
}
|
||
}
|
||
|
||
async UpdateFileVectorIndex(
|
||
embeddingModel: EmbeddingModel,
|
||
chunkSize: number,
|
||
batchSize: number,
|
||
file: TFile
|
||
) {
|
||
try {
|
||
// Delete existing vectors for the files
|
||
await this.repository.deleteVectorsForSingleFile(
|
||
file.path,
|
||
embeddingModel,
|
||
)
|
||
|
||
// Embed the files
|
||
const overlap = Math.floor(chunkSize * 0.15)
|
||
const textSplitter = new RecursiveCharacterTextSplitter({
|
||
chunkSize: chunkSize,
|
||
chunkOverlap: overlap,
|
||
separators: [
|
||
"\n\n",
|
||
"\n",
|
||
".",
|
||
",",
|
||
" ",
|
||
"\u200b", // Zero-width space
|
||
"\uff0c", // Fullwidth comma
|
||
"\u3001", // Ideographic comma
|
||
"\uff0e", // Fullwidth full stop
|
||
"\u3002", // Ideographic full stop
|
||
"",
|
||
],
|
||
});
|
||
let fileContent = await this.app.vault.cachedRead(file)
|
||
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||
fileContent = fileContent.replace(/\0/g, '')
|
||
const fileDocuments = await textSplitter.createDocuments([
|
||
fileContent,
|
||
])
|
||
|
||
const contentChunks: InsertVector[] = fileDocuments
|
||
.map((chunk): InsertVector | null => {
|
||
// 保存原始内容,不在此处调用 removeMarkdown
|
||
const rawContent = String(chunk.pageContent || '').replace(/\0/g, '')
|
||
if (!rawContent || rawContent.trim().length === 0) {
|
||
return null
|
||
}
|
||
return {
|
||
path: file.path,
|
||
mtime: file.stat.mtime,
|
||
content: rawContent, // 保存原始内容
|
||
embedding: [],
|
||
metadata: {
|
||
startLine: Number(chunk.metadata.loc.lines.from),
|
||
endLine: Number(chunk.metadata.loc.lines.to),
|
||
},
|
||
}
|
||
})
|
||
.filter((chunk): chunk is InsertVector => chunk !== null)
|
||
|
||
let batchCount = 0
|
||
|
||
try {
|
||
if (embeddingModel.supportsBatch) {
|
||
// 支持批量处理的提供商:使用流式处理逻辑
|
||
for (let i = 0; i < contentChunks.length; i += batchSize) {
|
||
batchCount++
|
||
console.log(`Embedding batch ${batchCount} of ${Math.ceil(contentChunks.length / batchSize)}`)
|
||
const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
|
||
|
||
const embeddedBatch: InsertVector[] = []
|
||
|
||
await backOff(
|
||
async () => {
|
||
// 在嵌入之前处理 markdown,只处理一次
|
||
const cleanedBatchData = batchChunks.map(chunk => {
|
||
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
|
||
return { chunk, cleanContent }
|
||
}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
|
||
|
||
if (cleanedBatchData.length === 0) {
|
||
return
|
||
}
|
||
|
||
const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
|
||
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
||
|
||
// 合并embedding结果到chunk数据
|
||
for (let j = 0; j < cleanedBatchData.length; j++) {
|
||
const { chunk, cleanContent } = cleanedBatchData[j]
|
||
const embeddedChunk: InsertVector = {
|
||
path: chunk.path,
|
||
mtime: chunk.mtime,
|
||
content: cleanContent, // 使用已经清理过的内容
|
||
embedding: batchEmbeddings[j],
|
||
metadata: chunk.metadata,
|
||
}
|
||
embeddedBatch.push(embeddedChunk)
|
||
}
|
||
},
|
||
{
|
||
numOfAttempts: 3, // 减少重试次数
|
||
startingDelay: 500, // 减少延迟
|
||
timeMultiple: 1.5,
|
||
jitter: 'full',
|
||
},
|
||
)
|
||
|
||
// 立即插入当前批次
|
||
if (embeddedBatch.length > 0) {
|
||
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||
// 清理批次数据
|
||
embeddedBatch.length = 0
|
||
}
|
||
|
||
// 定期内存清理
|
||
await this.memoryCleanup(batchCount)
|
||
}
|
||
} else {
|
||
// 不支持批量处理的提供商:使用流式处理逻辑
|
||
const limit = pLimit(10) // 从50降低到10
|
||
const abortController = new AbortController()
|
||
|
||
// 流式处理:分批处理并立即插入
|
||
for (let i = 0; i < contentChunks.length; i += batchSize) {
|
||
if (abortController.signal.aborted) {
|
||
throw new Error('Operation was aborted')
|
||
}
|
||
|
||
batchCount++
|
||
const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
|
||
const embeddedBatch: InsertVector[] = []
|
||
|
||
const tasks = batchChunks.map((chunk) =>
|
||
limit(async () => {
|
||
if (abortController.signal.aborted) {
|
||
throw new Error('Operation was aborted')
|
||
}
|
||
try {
|
||
await backOff(
|
||
async () => {
|
||
// 在嵌入之前处理 markdown
|
||
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
|
||
// 跳过清理后为空的内容
|
||
if (!cleanContent || cleanContent.trim().length === 0) {
|
||
return
|
||
}
|
||
|
||
const embedding = await embeddingModel.getEmbedding(cleanContent)
|
||
const embeddedChunk = {
|
||
path: chunk.path,
|
||
mtime: chunk.mtime,
|
||
content: cleanContent, // 使用清理后的内容
|
||
embedding,
|
||
metadata: chunk.metadata,
|
||
}
|
||
embeddedBatch.push(embeddedChunk)
|
||
},
|
||
{
|
||
numOfAttempts: 3, // 减少重试次数
|
||
startingDelay: 500, // 减少延迟
|
||
timeMultiple: 1.5,
|
||
jitter: 'full',
|
||
},
|
||
)
|
||
} catch (error) {
|
||
abortController.abort()
|
||
throw error
|
||
}
|
||
}),
|
||
)
|
||
|
||
await Promise.all(tasks)
|
||
|
||
// 立即插入当前批次
|
||
if (embeddedBatch.length > 0) {
|
||
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||
// 清理批次数据
|
||
embeddedBatch.length = 0
|
||
}
|
||
|
||
// 定期内存清理
|
||
await this.memoryCleanup(batchCount)
|
||
}
|
||
}
|
||
} catch (error) {
|
||
console.error('Error embedding chunks:', error)
|
||
} finally {
|
||
// 最终清理
|
||
this.forceGarbageCollection()
|
||
}
|
||
} catch (error) {
|
||
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||
new Notice(`跳过文件 ${file.name}: ${error.message}`)
|
||
}
|
||
}
|
||
|
||
async DeleteFileVectorIndex(
|
||
embeddingModel: EmbeddingModel,
|
||
file: TFile
|
||
) {
|
||
await this.repository.deleteVectorsForSingleFile(file.path, embeddingModel)
|
||
}
|
||
|
||
private async cleanVectorsForDeletedFiles(
|
||
embeddingModel: EmbeddingModel,
|
||
) {
|
||
const indexedFilePaths = await this.repository.getAllIndexedFilePaths(embeddingModel)
|
||
const needToDelete = indexedFilePaths.filter(filePath => !this.app.vault.getAbstractFileByPath(filePath))
|
||
if (needToDelete.length > 0) {
|
||
await this.repository.deleteVectorsForMultipleFiles(
|
||
needToDelete,
|
||
embeddingModel,
|
||
)
|
||
}
|
||
}
|
||
|
||
private async getFilesToIndex({
|
||
embeddingModel,
|
||
excludePatterns,
|
||
includePatterns,
|
||
reindexAll,
|
||
}: {
|
||
embeddingModel: EmbeddingModel
|
||
excludePatterns: string[]
|
||
includePatterns: string[]
|
||
reindexAll?: boolean
|
||
}): Promise<TFile[]> {
|
||
let filesToIndex = this.app.vault.getMarkdownFiles()
|
||
console.log("get all vault files: ", filesToIndex.length)
|
||
|
||
filesToIndex = filesToIndex.filter((file) => {
|
||
return !excludePatterns.some((pattern) => minimatch(file.path, pattern))
|
||
})
|
||
|
||
if (includePatterns.length > 0) {
|
||
filesToIndex = filesToIndex.filter((file) => {
|
||
return includePatterns.some((pattern) => minimatch(file.path, pattern))
|
||
})
|
||
}
|
||
|
||
if (reindexAll) {
|
||
return filesToIndex
|
||
}
|
||
|
||
// 优化流程:使用数据库最大mtime来过滤需要更新的文件
|
||
try {
|
||
const maxMtime = await this.repository.getMaxMtime(embeddingModel)
|
||
console.log("Database max mtime:", maxMtime)
|
||
|
||
if (maxMtime === null) {
|
||
// 数据库中没有任何向量,需要索引所有文件
|
||
return filesToIndex
|
||
}
|
||
|
||
// 筛选出在数据库最后更新时间之后修改的文件
|
||
return filesToIndex.filter((file) => {
|
||
return file.stat.mtime > maxMtime
|
||
})
|
||
} catch (error) {
|
||
console.error("Error getting max mtime from database:", error)
|
||
return []
|
||
}
|
||
}
|
||
|
||
private async getFilesToIndexInWorkspace({
|
||
embeddingModel,
|
||
workspace,
|
||
excludePatterns,
|
||
includePatterns,
|
||
reindexAll,
|
||
}: {
|
||
embeddingModel: EmbeddingModel
|
||
workspace: Workspace
|
||
excludePatterns: string[]
|
||
includePatterns: string[]
|
||
reindexAll?: boolean
|
||
}): Promise<TFile[]> {
|
||
// 获取工作区中的所有文件
|
||
const workspaceFiles = new Set<string>()
|
||
|
||
if (workspace) {
|
||
// 处理工作区中的文件夹和标签
|
||
for (const item of workspace.content) {
|
||
if (item.type === 'folder') {
|
||
const folderPath = item.content
|
||
|
||
// 获取文件夹下的所有文件
|
||
const files = this.app.vault.getMarkdownFiles().filter(file =>
|
||
file.path.startsWith(folderPath === '/' ? '' : folderPath + '/')
|
||
)
|
||
|
||
// 添加所有文件路径
|
||
files.forEach(file => {
|
||
workspaceFiles.add(file.path)
|
||
})
|
||
|
||
} else if (item.type === 'tag') {
|
||
// 获取标签对应的所有文件
|
||
const tagFiles = getFilesWithTag(item.content, this.app)
|
||
|
||
tagFiles.forEach(filePath => {
|
||
workspaceFiles.add(filePath)
|
||
})
|
||
}
|
||
}
|
||
}
|
||
|
||
// 将路径转换为 TFile 对象
|
||
let filesToIndex = Array.from(workspaceFiles)
|
||
.map(path => this.app.vault.getFileByPath(path))
|
||
.filter((file): file is TFile => file !== null && file instanceof TFile)
|
||
|
||
console.log("get workspace files: ", filesToIndex.length)
|
||
|
||
// 应用排除和包含模式
|
||
filesToIndex = filesToIndex.filter((file) => {
|
||
return !excludePatterns.some((pattern) => minimatch(file.path, pattern))
|
||
})
|
||
|
||
if (includePatterns.length > 0) {
|
||
filesToIndex = filesToIndex.filter((file) => {
|
||
return includePatterns.some((pattern) => minimatch(file.path, pattern))
|
||
})
|
||
}
|
||
|
||
if (reindexAll) {
|
||
return filesToIndex
|
||
}
|
||
|
||
// 优化流程:使用数据库最大mtime来过滤需要更新的文件
|
||
try {
|
||
const maxMtime = await this.repository.getMaxMtime(embeddingModel)
|
||
console.log("Database max mtime:", maxMtime)
|
||
|
||
if (maxMtime === null) {
|
||
// 数据库中没有任何向量,需要索引所有文件
|
||
return filesToIndex
|
||
}
|
||
|
||
// 筛选出在数据库最后更新时间之后修改的文件
|
||
return filesToIndex.filter((file) => {
|
||
return file.stat.mtime > maxMtime
|
||
})
|
||
} catch (error) {
|
||
console.error("Error getting max mtime from database:", error)
|
||
return []
|
||
}
|
||
}
|
||
}
|