Mirror of https://github.com/EthanMarti/infio-copilot.git, synced 2026-01-16 08:21:55 +00:00
update batch files in update index
This commit is contained in:
parent 21f4734917
commit c1fbd4da21
@@ -30,7 +30,7 @@ export class VectorManager {
 	constructor(app: App, dbManager: DBManager) {
 		this.app = app
 		this.dbManager = dbManager
-		this.repository = new VectorRepository(app, dbManager.getPgClient())
+		this.repository = new VectorRepository(app, dbManager.getPgClient() as any)
 	}

 	async performSimilaritySearch(
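
Note: the new `as any` on `dbManager.getPgClient()` silences a type mismatch between the client the manager returns and the parameter type `VectorRepository` declares. A narrower assertion keeps some checking; a minimal sketch, where `RepositoryClient` is a hypothetical stand-in for the type the repository actually expects:

	// Hypothetical structural type; the real one lives in VectorRepository.
	type RepositoryClient = { query(sql: string, params?: unknown[]): Promise<unknown> }

	// Asserting to a named shape documents the expected surface;
	// `as any` disables type checking entirely.
	const client = dbManager.getPgClient() as unknown as RepositoryClient
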
@@ -103,13 +103,25 @@ export class VectorManager {
 	// Helper method to force garbage collection
 	private forceGarbageCollection() {
 		try {
-			if (typeof global !== 'undefined' && global.gc) {
-				global.gc()
-			} else if (typeof window !== 'undefined' && (window as any).gc) {
-				((window as any).gc as () => void)();
+			// Force garbage collection several times to make sure resources are released
+			for (let i = 0; i < 3; i++) {
+				if (typeof global !== 'undefined' && (global as any).gc) {
+					(global as any).gc()
+				} else if (typeof window !== 'undefined' && (window as any).gc) {
+					(window as any).gc()
+				}
+			}
+
+			// Force-clear any lingering references
+			if (typeof global !== 'undefined' && (global as any).gc) {
+				// Node.js environment
+				setTimeout(() => {
+					(global as any).gc?.()
+				}, 0)
 			}
 		} catch (e) {
 			// Ignore garbage collection errors
+			console.debug('GC error (ignored):', e)
 		}
 	}

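
Note: the rewritten helper above only does real work when the runtime actually exposes a `gc` hook (for Node.js, that means starting with `--expose-gc`); otherwise the guards fall through and the catch swallows any failure. A minimal standalone sketch of the same guard pattern:

	// No-op unless the runtime exposes gc (e.g. `node --expose-gc`).
	function tryForceGc(rounds = 3): void {
		const gc = (globalThis as { gc?: () => void }).gc
		if (!gc) return // nothing exposed; GC stays under the engine's control
		for (let i = 0; i < rounds; i++) gc()
	}
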
@@ -188,198 +200,207 @@ export class VectorManager {
 		console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)

 		const skippedFiles: string[] = []
-		const contentChunks: InsertVector[] = (
-			await Promise.all(
-				filesToIndex.map(async (file) => {
-					try {
-						let fileContent = await this.app.vault.cachedRead(file)
-						// Strip null bytes to prevent PostgreSQL UTF8 encoding errors
-						fileContent = fileContent.replace(/\0/g, '')
-						const fileDocuments = await textSplitter.createDocuments([
-							fileContent,
-						])
-						return fileDocuments
-							.map((chunk): InsertVector | null => {
-								// Keep the raw content; do not call removeMarkdown here
-								const rawContent = chunk.pageContent.replace(/\0/g, '')
-								if (!rawContent || rawContent.trim().length === 0) {
-									return null
-								}
-								return {
-									path: file.path,
-									mtime: file.stat.mtime,
-									content: rawContent, // store the raw content
-									embedding: [],
-									metadata: {
-										startLine: Number(chunk.metadata.loc.lines.from),
-										endLine: Number(chunk.metadata.loc.lines.to),
-									},
-								}
-							})
-							.filter((chunk): chunk is InsertVector => chunk !== null)
-					} catch (error) {
-						console.warn(`Skipping file ${file.path}:`, error.message)
-						skippedFiles.push(file.path)
-						return []
-					}
-				}),
-			)
-		).flat()
-
-		console.log("contentChunks: ", contentChunks.length)
-
-		if (skippedFiles.length > 0) {
-			console.warn(`Skipped ${skippedFiles.length} problematic files:`, skippedFiles)
-			new Notice(`Skipped ${skippedFiles.length} problematic files`)
+		const embeddingProgress = { completed: 0, totalChunks: 0 }
+
+		// Process files in batches of at most 50 (reduced to avoid exhausting file handles)
+		const FILE_BATCH_SIZE = 50
+		// Reduce the batch size to lower memory pressure
+		const embeddingBatchSize = Math.min(options.batchSize, 10)
+
+		// First count the total number of chunks for progress reporting
+		let totalChunks = 0
+		for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
+			const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
+			for (const file of fileBatch) {
+				try {
+					let fileContent = await this.app.vault.cachedRead(file)
+					fileContent = fileContent.replace(/\0/g, '')
+					const fileDocuments = await textSplitter.createDocuments([fileContent])
+					totalChunks += fileDocuments.length
+				} catch (error) {
+					// Skip broken files during the counting phase
+				}
+			}
 		}

+		embeddingProgress.totalChunks = totalChunks
 		updateProgress?.({
 			completedChunks: 0,
-			totalChunks: contentChunks.length,
+			totalChunks: totalChunks,
 			totalFiles: filesToIndex.length,
 		})

-		const embeddingProgress = { completed: 0 }
-		// Reduce the batch size to lower memory pressure
-		const batchSize = Math.min(options.batchSize, 20) // cap the maximum batch size
-		let batchCount = 0

 		try {
-			if (embeddingModel.supportsBatch) {
-				// Providers that support batching: use streaming logic
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
-					batchCount++
-					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
-					const embeddedBatch: InsertVector[] = []
-					await backOff(
-						async () => {
-							// Process markdown before embedding
-							const cleanedBatchData = batchChunks.map(chunk => {
-								const cleanContent = removeMarkdown(chunk.content)
-								return { chunk, cleanContent }
-							}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
-
-							if (cleanedBatchData.length === 0) {
-								return
-							}
-
-							const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
-							const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
-
-							// Merge the embedding results into the chunk data
-							for (let j = 0; j < cleanedBatchData.length; j++) {
-								const { chunk, cleanContent } = cleanedBatchData[j]
-								const embeddedChunk: InsertVector = {
-									path: chunk.path,
-									mtime: chunk.mtime,
-									content: cleanContent, // use the already-cleaned content
-									embedding: batchEmbeddings[j],
-									metadata: chunk.metadata,
-								}
-								embeddedBatch.push(embeddedChunk)
-							}
-						},
-						{
-							numOfAttempts: 3, // fewer retry attempts
-							startingDelay: 500, // shorter delay
-							timeMultiple: 1.5,
-							jitter: 'full',
-						},
-					)
-
-					// Batch-insert inside a transaction to reduce database connection pressure
-					if (embeddedBatch.length > 0) {
-						await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						console.log("insert vectors with transaction success, batch size: ", embeddedBatch.length)
-						// Clear the batch data
-						embeddedBatch.length = 0
-					}
-
-					embeddingProgress.completed += batchChunks.length
-					updateProgress?.({
-						completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
-						totalFiles: filesToIndex.length,
-					})
-
-					// Periodic memory cleanup and delay
-					await this.memoryCleanupWithDelay(batchCount)
-				}
-			} else {
-				// Providers without batch support: drastically reduce concurrency
-				const limit = pLimit(8) // lowered from 32 to 8 to greatly reduce concurrency pressure
-				const abortController = new AbortController()
-
-				// Streaming: process in batches and insert immediately
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
-					if (abortController.signal.aborted) {
-						throw new Error('Operation was aborted')
-					}
-
-					batchCount++
-					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
-					const embeddedBatch: InsertVector[] = []
-
-					const tasks = batchChunks.map((chunk) =>
-						limit(async () => {
-							if (abortController.signal.aborted) {
-								throw new Error('Operation was aborted')
-							}
+			for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
+				const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
+				console.log(`Processing file batch ${Math.floor(i / FILE_BATCH_SIZE) + 1}/${Math.ceil(filesToIndex.length / FILE_BATCH_SIZE)} (${fileBatch.length} files)`)
+
+				// Step 1: chunking
+				const batchChunks = (
+					await Promise.all(
+						fileBatch.map(async (file) => {
 							try {
-								await backOff(
-									async () => {
-										// Process markdown before embedding
-										const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
-										// Skip content that is empty after cleaning
-										if (!cleanContent || cleanContent.trim().length === 0) {
-											return
+								let fileContent = await this.app.vault.cachedRead(file)
+								// Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+								fileContent = fileContent.replace(/\0/g, '')
+								const fileDocuments = await textSplitter.createDocuments([
+									fileContent,
+								])
+								return fileDocuments
+									.map((chunk): InsertVector | null => {
+										// Keep the raw content; do not call removeMarkdown here
+										const rawContent = chunk.pageContent.replace(/\0/g, '')
+										if (!rawContent || rawContent.trim().length === 0) {
+											return null
 										}
-
-										const embedding = await embeddingModel.getEmbedding(cleanContent)
-										const embeddedChunk = {
-											path: chunk.path,
-											mtime: chunk.mtime,
-											content: cleanContent, // use the cleaned content
-											embedding,
-											metadata: chunk.metadata,
+										return {
+											path: file.path,
+											mtime: file.stat.mtime,
+											content: rawContent, // store the raw content
+											embedding: [],
+											metadata: {
+												startLine: Number(chunk.metadata.loc.lines.from),
+												endLine: Number(chunk.metadata.loc.lines.to),
+											},
 										}
-										embeddedBatch.push(embeddedChunk)
-									},
-									{
-										numOfAttempts: 3,
-										startingDelay: 1000, // longer delay
-										timeMultiple: 2.0,
-										jitter: 'full',
-									},
-								)
+									})
+									.filter((chunk): chunk is InsertVector => chunk !== null)
 							} catch (error) {
-								console.error('Error in embedding task:', error)
-								// Do not abort immediately; keep processing the other tasks
+								console.warn(`Skipping file ${file.path}:`, error.message)
+								skippedFiles.push(file.path)
+								return []
 							}
 						}),
 					)
-
-					await Promise.all(tasks)
-
-					// Batch-insert inside a transaction
-					if (embeddedBatch.length > 0) {
-						await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						// Clear the batch data
-						embeddedBatch.length = 0
-					}
-
-					embeddingProgress.completed += batchChunks.length
-					updateProgress?.({
-						completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
-						totalFiles: filesToIndex.length,
-					})
-
-					// Periodic memory cleanup and delay
-					await this.memoryCleanupWithDelay(batchCount)
+				).flat()
+
+				if (batchChunks.length === 0) {
+					continue
 				}

+				// Step 2: embedding
+				console.log(`Embedding ${batchChunks.length} chunks for current file batch`)
+
+				if (embeddingModel.supportsBatch) {
+					// Providers that support batching
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
+						const embeddedBatch: InsertVector[] = []
+
+						await backOff(
+							async () => {
+								// Process markdown before embedding
+								const cleanedBatchData = embeddingBatch.map(chunk => {
+									const cleanContent = removeMarkdown(chunk.content)
+									return { chunk, cleanContent }
+								}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
+
+								if (cleanedBatchData.length === 0) {
+									return
+								}
+
+								const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
+								const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
+
+								// Merge the embedding results into the chunk data
+								for (let k = 0; k < cleanedBatchData.length; k++) {
+									const { chunk, cleanContent } = cleanedBatchData[k]
+									const embeddedChunk: InsertVector = {
+										path: chunk.path,
+										mtime: chunk.mtime,
+										content: cleanContent, // use the already-cleaned content
+										embedding: batchEmbeddings[k],
+										metadata: chunk.metadata,
+									}
+									embeddedBatch.push(embeddedChunk)
+								}
+							},
+							{
+								numOfAttempts: 3,
+								startingDelay: 500,
+								timeMultiple: 1.5,
+								jitter: 'full',
+							},
+						)
+
+						// Step 3: store immediately
+						if (embeddedBatch.length > 0) {
+							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
+							console.log(`Stored ${embeddedBatch.length} embedded chunks`)
+						}
+
+						embeddingProgress.completed += embeddingBatch.length
+						updateProgress?.({
+							completedChunks: embeddingProgress.completed,
+							totalChunks: embeddingProgress.totalChunks,
+							totalFiles: filesToIndex.length,
+						})
+					}
+				} else {
+					// Providers without batch support (reduced concurrency to avoid exhausting file handles)
+					const limit = pLimit(3)
+
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
+						const embeddedBatch: InsertVector[] = []
+
+						const tasks = embeddingBatch.map((chunk) =>
+							limit(async () => {
+								try {
+									await backOff(
+										async () => {
+											// Process markdown before embedding
+											const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
+											// Skip content that is empty after cleaning
+											if (!cleanContent || cleanContent.trim().length === 0) {
+												return
+											}
+
+											const embedding = await embeddingModel.getEmbedding(cleanContent)
+											const embeddedChunk = {
+												path: chunk.path,
+												mtime: chunk.mtime,
+												content: cleanContent, // use the cleaned content
+												embedding,
+												metadata: chunk.metadata,
+											}
+											embeddedBatch.push(embeddedChunk)
+										},
+										{
+											numOfAttempts: 3,
+											startingDelay: 1000,
+											timeMultiple: 2.0,
+											jitter: 'full',
+										},
+									)
+								} catch (error) {
+									console.error('Error in embedding task:', error)
+								}
+							}),
+						)
+
+						await Promise.all(tasks)
+
+						// Step 3: store immediately
+						if (embeddedBatch.length > 0) {
+							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
+							console.log(`Stored ${embeddedBatch.length} embedded chunks`)
+						}
+
+						embeddingProgress.completed += embeddingBatch.length
+						updateProgress?.({
+							completedChunks: embeddingProgress.completed,
+							totalChunks: embeddingProgress.totalChunks,
+							totalFiles: filesToIndex.length,
+						})
+					}
+				}
+
+				// Force resource cleanup after each file batch
+				await this.forceResourceCleanup()
+
+				// Extra delay to allow the system to release file handles
+				await new Promise(resolve => setTimeout(resolve, 500))
 			}
 		} catch (error) {
 			if (
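
Note: the hunk above replaces "chunk the whole vault, then embed" with a streaming pipeline: one cheap counting pass to fix the progress total, then, for each batch of at most 50 files, chunk, embed, and insert before touching the next batch, so only one batch of chunks is alive at a time. A condensed sketch of that control flow; `chunkFiles`, `embedChunks`, and `storeChunks` are hypothetical stand-ins, not functions from this repo:

	const FILE_BATCH_SIZE = 50

	async function indexInBatches<TFile>(
		files: TFile[],
		chunkFiles: (batch: TFile[]) => Promise<string[]>,
		embedChunks: (chunks: string[]) => Promise<number[][]>,
		storeChunks: (chunks: string[], vectors: number[][]) => Promise<void>,
	): Promise<void> {
		for (let i = 0; i < files.length; i += FILE_BATCH_SIZE) {
			const batch = files.slice(i, i + FILE_BATCH_SIZE)
			const chunks = await chunkFiles(batch) // step 1: chunk
			if (chunks.length === 0) continue
			const vectors = await embedChunks(chunks) // step 2: embed
			await storeChunks(chunks, vectors) // step 3: store immediately
			// chunks and vectors go out of scope here, bounding peak memory
		}
	}
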
@@ -395,8 +416,13 @@ export class VectorManager {
 				throw error
 			}
 		} finally {
-			// Final cleanup
-			this.forceGarbageCollection()
+			// Final forced cleanup
+			await this.forceResourceCleanup()
+		}
+
+		if (skippedFiles.length > 0) {
+			console.warn(`Skipped ${skippedFiles.length} problematic files:`, skippedFiles)
+			new Notice(`Skipped ${skippedFiles.length} problematic files`)
 		}
 	}
@@ -472,197 +498,207 @@ export class VectorManager {
 		console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)

 		const skippedFiles: string[] = []
-		const contentChunks: InsertVector[] = (
-			await Promise.all(
-				filesToIndex.map(async (file) => {
-					try {
-						let fileContent = await this.app.vault.cachedRead(file)
-						// Strip null bytes to prevent PostgreSQL UTF8 encoding errors
-						fileContent = fileContent.replace(/\0/g, '')
-						const fileDocuments = await textSplitter.createDocuments([
-							fileContent,
-						])
-						return fileDocuments
-							.map((chunk): InsertVector | null => {
-								// Keep the raw content; do not call removeMarkdown here
-								const rawContent = chunk.pageContent.replace(/\0/g, '')
-								if (!rawContent || rawContent.trim().length === 0) {
-									return null
-								}
-								return {
-									path: file.path,
-									mtime: file.stat.mtime,
-									content: rawContent, // store the raw content
-									embedding: [],
-									metadata: {
-										startLine: Number(chunk.metadata.loc.lines.from),
-										endLine: Number(chunk.metadata.loc.lines.to),
-									},
-								}
-							})
-							.filter((chunk): chunk is InsertVector => chunk !== null)
-					} catch (error) {
-						console.warn(`Skipping file ${file.path}:`, error.message)
-						skippedFiles.push(file.path)
-						return []
-					}
-				}),
-			)
-		).flat()
-
-		console.log("contentChunks: ", contentChunks.length)
-
-		if (skippedFiles.length > 0) {
-			console.warn(`Skipped ${skippedFiles.length} problematic files:`, skippedFiles)
-			new Notice(`Skipped ${skippedFiles.length} problematic files`)
+		const embeddingProgress = { completed: 0, totalChunks: 0 }
+
+		// Process files in batches of at most 50 (reduced to avoid exhausting file handles)
+		const FILE_BATCH_SIZE = 50
+		// Reduce the batch size to lower memory pressure
+		const embeddingBatchSize = Math.min(options.batchSize, 10)
+
+		// First count the total number of chunks for progress reporting
+		let totalChunks = 0
+		for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
+			const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
+			for (const file of fileBatch) {
+				try {
+					let fileContent = await this.app.vault.cachedRead(file)
+					fileContent = fileContent.replace(/\0/g, '')
+					const fileDocuments = await textSplitter.createDocuments([fileContent])
+					totalChunks += fileDocuments.length
+				} catch (error) {
+					// Skip broken files during the counting phase
+				}
+			}
 		}

+		embeddingProgress.totalChunks = totalChunks
 		updateProgress?.({
 			completedChunks: 0,
-			totalChunks: contentChunks.length,
+			totalChunks: totalChunks,
 			totalFiles: filesToIndex.length,
 		})

-		const embeddingProgress = { completed: 0 }
-		// Reduce the batch size to lower memory pressure
-		const batchSize = Math.min(options.batchSize, 20) // cap the maximum batch size
-		let batchCount = 0

 		try {
-			if (embeddingModel.supportsBatch) {
-				// Providers that support batching: use streaming logic
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
-					batchCount++
-					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
-					const embeddedBatch: InsertVector[] = []
-					await backOff(
-						async () => {
-							// Process markdown before embedding, only once
-							const cleanedBatchData = batchChunks.map(chunk => {
-								const cleanContent = removeMarkdown(chunk.content)
-								return { chunk, cleanContent }
-							}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
-
-							if (cleanedBatchData.length === 0) {
-								return
-							}
-
-							const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
-							const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
-
-							// Merge the embedding results into the chunk data
-							for (let j = 0; j < cleanedBatchData.length; j++) {
-								const { chunk, cleanContent } = cleanedBatchData[j]
-								const embeddedChunk: InsertVector = {
-									path: chunk.path,
-									mtime: chunk.mtime,
-									content: cleanContent, // use the already-cleaned content
-									embedding: batchEmbeddings[j],
-									metadata: chunk.metadata,
-								}
-								embeddedBatch.push(embeddedChunk)
-							}
-						},
-						{
-							numOfAttempts: 3,
-							startingDelay: 1000, // longer delay
-							timeMultiple: 2.0,
-							jitter: 'full',
-						},
-					)
-
-					// Batch-insert inside a transaction to reduce database connection pressure
-					if (embeddedBatch.length > 0) {
-						await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						// Clear the batch data
-						embeddedBatch.length = 0
-					}
-
-					embeddingProgress.completed += batchChunks.length
-					updateProgress?.({
-						completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
-						totalFiles: filesToIndex.length,
-					})
-
-					// Periodic memory cleanup and delay
-					await this.memoryCleanupWithDelay(batchCount)
-				}
-			} else {
-				// Providers without batch support: drastically reduce concurrency
-				const limit = pLimit(8) // lowered from 32 to 8 to greatly reduce concurrency pressure
-				const abortController = new AbortController()
-
-				// Streaming: process in batches and insert immediately
-				for (let i = 0; i < contentChunks.length; i += batchSize) {
-					if (abortController.signal.aborted) {
-						throw new Error('Operation was aborted')
-					}
-
-					batchCount++
-					const batchChunks = contentChunks.slice(i, Math.min(i + batchSize, contentChunks.length))
-					const embeddedBatch: InsertVector[] = []
-
-					const tasks = batchChunks.map((chunk) =>
-						limit(async () => {
-							if (abortController.signal.aborted) {
-								throw new Error('Operation was aborted')
-							}
+			for (let i = 0; i < filesToIndex.length; i += FILE_BATCH_SIZE) {
+				const fileBatch = filesToIndex.slice(i, Math.min(i + FILE_BATCH_SIZE, filesToIndex.length))
+				console.log(`Processing workspace file batch ${Math.floor(i / FILE_BATCH_SIZE) + 1}/${Math.ceil(filesToIndex.length / FILE_BATCH_SIZE)} (${fileBatch.length} files)`)
+
+				// Step 1: chunking
+				const batchChunks = (
+					await Promise.all(
+						fileBatch.map(async (file) => {
 							try {
-								await backOff(
-									async () => {
-										// Process markdown before embedding
-										const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
-										// Skip content that is empty after cleaning
-										if (!cleanContent || cleanContent.trim().length === 0) {
-											return
+								let fileContent = await this.app.vault.cachedRead(file)
+								// Strip null bytes to prevent PostgreSQL UTF8 encoding errors
+								fileContent = fileContent.replace(/\0/g, '')
+								const fileDocuments = await textSplitter.createDocuments([
+									fileContent,
+								])
+								return fileDocuments
+									.map((chunk): InsertVector | null => {
+										// Keep the raw content; do not call removeMarkdown here
+										const rawContent = chunk.pageContent.replace(/\0/g, '')
+										if (!rawContent || rawContent.trim().length === 0) {
+											return null
 										}
-
-										const embedding = await embeddingModel.getEmbedding(cleanContent)
-										const embeddedChunk = {
-											path: chunk.path,
-											mtime: chunk.mtime,
-											content: cleanContent, // use the cleaned content
-											embedding,
-											metadata: chunk.metadata,
+										return {
+											path: file.path,
+											mtime: file.stat.mtime,
+											content: rawContent, // store the raw content
+											embedding: [],
+											metadata: {
+												startLine: Number(chunk.metadata.loc.lines.from),
+												endLine: Number(chunk.metadata.loc.lines.to),
+											},
 										}
-										embeddedBatch.push(embeddedChunk)
-									},
-									{
-										numOfAttempts: 3,
-										startingDelay: 1000, // longer delay
-										timeMultiple: 2.0,
-										jitter: 'full',
-									},
-								)
+									})
+									.filter((chunk): chunk is InsertVector => chunk !== null)
 							} catch (error) {
-								console.error('Error in embedding task:', error)
-								// Do not abort immediately; keep processing the other tasks
+								console.warn(`Skipping file ${file.path}:`, error.message)
+								skippedFiles.push(file.path)
+								return []
 							}
 						}),
 					)
-
-					await Promise.all(tasks)
-
-					// Batch-insert inside a transaction
-					if (embeddedBatch.length > 0) {
-						await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
-						// Clear the batch data
-						embeddedBatch.length = 0
-					}
-
-					embeddingProgress.completed += batchChunks.length
-					updateProgress?.({
-						completedChunks: embeddingProgress.completed,
-						totalChunks: contentChunks.length,
-						totalFiles: filesToIndex.length,
-					})
-
-					// Periodic memory cleanup and delay
-					await this.memoryCleanupWithDelay(batchCount)
+				).flat()
+
+				if (batchChunks.length === 0) {
+					continue
 				}

+				// Step 2: embedding
+				console.log(`Embedding ${batchChunks.length} chunks for current workspace file batch`)
+
+				if (embeddingModel.supportsBatch) {
+					// Providers that support batching
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
+						const embeddedBatch: InsertVector[] = []
+
+						await backOff(
+							async () => {
+								// Process markdown before embedding
+								const cleanedBatchData = embeddingBatch.map(chunk => {
+									const cleanContent = removeMarkdown(chunk.content)
+									return { chunk, cleanContent }
+								}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
+
+								if (cleanedBatchData.length === 0) {
+									return
+								}
+
+								const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
+								const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
+
+								// Merge the embedding results into the chunk data
+								for (let k = 0; k < cleanedBatchData.length; k++) {
+									const { chunk, cleanContent } = cleanedBatchData[k]
+									const embeddedChunk: InsertVector = {
+										path: chunk.path,
+										mtime: chunk.mtime,
+										content: cleanContent, // use the already-cleaned content
+										embedding: batchEmbeddings[k],
+										metadata: chunk.metadata,
+									}
+									embeddedBatch.push(embeddedChunk)
+								}
+							},
+							{
+								numOfAttempts: 3,
+								startingDelay: 1000,
+								timeMultiple: 2.0,
+								jitter: 'full',
+							},
+						)
+
+						// Step 3: store immediately
+						if (embeddedBatch.length > 0) {
+							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
+							console.log(`Stored ${embeddedBatch.length} embedded chunks for workspace`)
+						}
+
+						embeddingProgress.completed += embeddingBatch.length
+						updateProgress?.({
+							completedChunks: embeddingProgress.completed,
+							totalChunks: embeddingProgress.totalChunks,
+							totalFiles: filesToIndex.length,
+						})
+					}
+				} else {
+					// Providers without batch support (reduced concurrency to avoid exhausting file handles)
+					const limit = pLimit(3)
+
+					for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
+						const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
+						const embeddedBatch: InsertVector[] = []
+
+						const tasks = embeddingBatch.map((chunk) =>
+							limit(async () => {
+								try {
+									await backOff(
+										async () => {
+											// Process markdown before embedding
+											const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
+											// Skip content that is empty after cleaning
+											if (!cleanContent || cleanContent.trim().length === 0) {
+												return
+											}
+
+											const embedding = await embeddingModel.getEmbedding(cleanContent)
+											const embeddedChunk = {
+												path: chunk.path,
+												mtime: chunk.mtime,
+												content: cleanContent, // use the cleaned content
+												embedding,
+												metadata: chunk.metadata,
+											}
+											embeddedBatch.push(embeddedChunk)
+										},
+										{
+											numOfAttempts: 3,
+											startingDelay: 1000,
+											timeMultiple: 2.0,
+											jitter: 'full',
+										},
+									)
+								} catch (error) {
+									console.error('Error in embedding task:', error)
+								}
+							}),
+						)
+
+						await Promise.all(tasks)
+
+						// Step 3: store immediately
+						if (embeddedBatch.length > 0) {
+							await this.insertVectorsWithTransaction(embeddedBatch, embeddingModel)
+							console.log(`Stored ${embeddedBatch.length} embedded chunks for workspace`)
+						}
+
+						embeddingProgress.completed += embeddingBatch.length
+						updateProgress?.({
+							completedChunks: embeddingProgress.completed,
+							totalChunks: embeddingProgress.totalChunks,
+							totalFiles: filesToIndex.length,
+						})
+					}
+				}
+
+				// Force resource cleanup after each file batch
+				await this.forceResourceCleanup()
+
+				// Extra delay to allow the system to release file handles
+				await new Promise(resolve => setTimeout(resolve, 500))
 			}
 		} catch (error) {
 			if (
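
Note: both indexing paths lean on two small npm packages: `p-limit` to cap in-flight embedding requests and `exponential-backoff`'s `backOff` for retries with jitter. A self-contained sketch of how they compose, with `embed` as a placeholder for a real embedding call:

	import pLimit from 'p-limit'
	import { backOff } from 'exponential-backoff'

	declare function embed(text: string): Promise<number[]> // placeholder

	const limit = pLimit(3) // at most 3 embedding requests in flight

	async function embedAll(texts: string[]): Promise<number[][]> {
		return Promise.all(
			texts.map((text) =>
				limit(() =>
					backOff(() => embed(text), {
						numOfAttempts: 3, // give up after 3 tries
						startingDelay: 1000, // first retry after ~1s
						timeMultiple: 2.0, // double the delay each retry
						jitter: 'full', // randomize delays to avoid synchronized retries
					}),
				),
			),
		)
	}
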
@@ -678,8 +714,13 @@ export class VectorManager {
 				throw error
 			}
 		} finally {
-			// Final cleanup
-			this.forceGarbageCollection()
+			// Final forced cleanup
+			await this.forceResourceCleanup()
+		}
+
+		if (skippedFiles.length > 0) {
+			console.warn(`Skipped ${skippedFiles.length} problematic files:`, skippedFiles)
+			new Notice(`Skipped ${skippedFiles.length} problematic files`)
 		}
 	}
@@ -1034,14 +1075,26 @@ export class VectorManager {

 	// Enhanced memory cleanup method with added delay
 	private async memoryCleanupWithDelay(batchCount: number) {
-		// Force garbage collection and delay every 5 batches
-		if (batchCount % 5 === 0) {
+		// Force garbage collection and delay every 3 batches
+		if (batchCount % 3 === 0) {
 			this.forceGarbageCollection()
-			// Add a delay so the system has time to process
-			await new Promise(resolve => setTimeout(resolve, 500))
+			// Add a delay so the system has time to process and release file handles
+			await new Promise(resolve => setTimeout(resolve, 1000))
 		}
 	}

+	// Forced memory and resource cleanup
+	private async forceResourceCleanup() {
+		// Run garbage collection several times
+		for (let i = 0; i < 5; i++) {
+			this.forceGarbageCollection()
+			await new Promise(resolve => setTimeout(resolve, 100))
+		}
+
+		// Extra delay to let the system release resources
+		await new Promise(resolve => setTimeout(resolve, 500))
+	}
+
 	// Method to insert vectors within a transaction
 	private async insertVectorsWithTransaction(
 		data: InsertVector[],
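
Note: the cleanup cadence above (a forced-GC pass every 3 batches, plus a fixed pause) generalizes to a small helper; a sketch under the assumption that the cleanup callback is cheap enough to run repeatedly:

	// "Clean up every N batches, then pause" cadence from memoryCleanupWithDelay.
	async function maybeCleanup(
		batchCount: number,
		cleanup: () => void, // e.g. a forced-GC helper, when the runtime exposes one
		every = 3,
		pauseMs = 1000,
	): Promise<void> {
		if (batchCount % every !== 0) return
		cleanup()
		await new Promise((resolve) => setTimeout(resolve, pauseMs))
	}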