update embeddings

This commit is contained in:
duanfuxiang 2025-06-14 09:17:44 +08:00
parent c71a13a659
commit f1ecc16c26
4 changed files with 322 additions and 211 deletions

View File

@ -258,6 +258,8 @@ export const InlineEdit: React.FC<InlineEditProps> = ({
let fileContent: string; let fileContent: string;
try { try {
fileContent = await plugin.app.vault.cachedRead(activeFile); fileContent = await plugin.app.vault.cachedRead(activeFile);
// 清理null字节防止PostgreSQL UTF8编码错误
fileContent = fileContent.replace(/\0/g, '');
} catch (err) { } catch (err) {
const error = err as Error; const error = err as Error;
console.error(t("inlineEdit.readFileError"), error.message); console.error(t("inlineEdit.readFileError"), error.message);
@ -278,7 +280,9 @@ export const InlineEdit: React.FC<InlineEditProps> = ({
return; return;
} }
const oldContent = await plugin.app.vault.read(activeFile); let oldContent = await plugin.app.vault.read(activeFile);
// 清理null字节防止PostgreSQL UTF8编码错误
oldContent = oldContent.replace(/\0/g, '');
await plugin.app.workspace.getLeaf(true).setViewState({ await plugin.app.workspace.getLeaf(true).setViewState({
type: APPLY_VIEW_TYPE, type: APPLY_VIEW_TYPE,
active: true, active: true,

View File

@ -56,7 +56,9 @@ export async function matchSearchUsingCorePlugin(
break; break;
} }
const content = await vault.cachedRead(file as TFile); let content = await vault.cachedRead(file as TFile);
// 清理null字节防止PostgreSQL UTF8编码错误
content = content.replace(/\0/g, '');
const lines = content.split('\n'); const lines = content.split('\n');
// `fileMatches.result.content` holds an array of matches for the file. // `fileMatches.result.content` holds an array of matches for the file.

View File

@ -52,6 +52,29 @@ export class VectorManager {
) )
} }
/**
 * Best-effort helper that asks the runtime to run garbage collection now.
 *
 * Invokes `global.gc` when it is defined; otherwise falls back to
 * `window.gc` when that is defined. When neither hook exists the call is a
 * no-op. Any error raised while probing or invoking the hook is swallowed
 * deliberately so that a failed collection never interrupts the caller.
 *
 * NOTE(review): presumably `gc` is only exposed when the runtime was started
 * with GC exposure enabled (e.g. V8 `--expose-gc`) — confirm.
 */
private forceGarbageCollection() {
  try {
    // Prefer the hook on `global` when it is available.
    if (typeof global !== 'undefined' && global.gc) {
      global.gc()
      return
    }
    if (typeof window === 'undefined') {
      return
    }
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- `gc` is not declared on Window
    const browserGlobal = window as any
    if (browserGlobal.gc) {
      browserGlobal.gc()
    }
  } catch {
    // Ignore garbage-collection errors: this is purely opportunistic.
  }
}
/**
 * Periodic memory-cleanup hook for the batch-processing loops.
 *
 * On every batch whose counter is a multiple of 10 this requests a garbage
 * collection and then waits 100 ms, a short pause intended to let the
 * cleanup finish. On all other batches it resolves without doing anything.
 *
 * @param batchCount Running count of processed batches, supplied by the caller.
 */
private async memoryCleanup(batchCount: number) {
  const isCleanupBatch = batchCount % 10 === 0
  if (!isCleanupBatch) {
    return
  }
  this.forceGarbageCollection()
  // Brief delay so the requested collection has a chance to complete.
  await new Promise(resolve => setTimeout(resolve, 100))
}
async updateVaultIndex( async updateVaultIndex(
embeddingModel: EmbeddingModel, embeddingModel: EmbeddingModel,
options: { options: {
@ -100,29 +123,43 @@ export class VectorManager {
}, },
) )
const skippedFiles: string[] = []
const contentChunks: InsertVector[] = ( const contentChunks: InsertVector[] = (
await Promise.all( await Promise.all(
filesToIndex.map(async (file) => { filesToIndex.map(async (file) => {
const fileContent = await this.app.vault.cachedRead(file) try {
const fileDocuments = await textSplitter.createDocuments([ let fileContent = await this.app.vault.cachedRead(file)
fileContent, // 清理null字节防止PostgreSQL UTF8编码错误
]) fileContent = fileContent.replace(/\0/g, '')
return fileDocuments.map((chunk): InsertVector => { const fileDocuments = await textSplitter.createDocuments([
return { fileContent,
path: file.path, ])
mtime: file.stat.mtime, return fileDocuments.map((chunk): InsertVector => {
content: chunk.pageContent, return {
embedding: [], path: file.path,
metadata: { mtime: file.stat.mtime,
startLine: Number(chunk.metadata.loc.lines.from), content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
endLine: Number(chunk.metadata.loc.lines.to), embedding: [],
}, metadata: {
} startLine: Number(chunk.metadata.loc.lines.from),
}) endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
} catch (error) {
console.warn(`跳过文件 ${file.path}:`, error.message)
skippedFiles.push(file.path)
return []
}
}), }),
) )
).flat() ).flat()
if (skippedFiles.length > 0) {
console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
}
updateProgress?.({ updateProgress?.({
completedChunks: 0, completedChunks: 0,
totalChunks: contentChunks.length, totalChunks: contentChunks.length,
@ -130,18 +167,22 @@ export class VectorManager {
}) })
const embeddingProgress = { completed: 0 } const embeddingProgress = { completed: 0 }
const embeddingChunks: InsertVector[] = [] // 减少批量大小以降低内存压力
const insertBatchSize = 64 // 数据库插入批量大小 const insertBatchSize = 16 // 从64降低到16
let batchCount = 0
try { try {
if (embeddingModel.supportsBatch) { if (embeddingModel.supportsBatch) {
// 支持批量处理的提供商:使用批量处理逻辑 // 支持批量处理的提供商:使用流式处理逻辑
const embeddingBatchSize = 64 // API批量处理大小 const embeddingBatchSize = 16 // 从64降低到16
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) { for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
batchCount++
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length)) const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
const batchTexts = batchChunks.map(chunk => chunk.content) const batchTexts = batchChunks.map(chunk => chunk.content)
const embeddedBatch: InsertVector[] = []
await backOff( await backOff(
async () => { async () => {
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
@ -155,80 +196,99 @@ export class VectorManager {
embedding: batchEmbeddings[j], embedding: batchEmbeddings[j],
metadata: batchChunks[j].metadata, metadata: batchChunks[j].metadata,
} }
embeddingChunks.push(embeddedChunk) embeddedBatch.push(embeddedChunk)
} }
embeddingProgress.completed += batchChunks.length
updateProgress?.({
completedChunks: embeddingProgress.completed,
totalChunks: contentChunks.length,
totalFiles: filesToIndex.length,
})
}, },
{ {
numOfAttempts: 5, numOfAttempts: 3, // 减少重试次数
startingDelay: 1000, startingDelay: 500, // 减少延迟
timeMultiple: 1.5, timeMultiple: 1.5,
jitter: 'full', jitter: 'full',
}, },
) )
// 立即插入当前批次,避免内存累积
if (embeddedBatch.length > 0) {
await this.repository.insertVectors(embeddedBatch, embeddingModel)
// 清理批次数据
embeddedBatch.length = 0
}
embeddingProgress.completed += batchChunks.length
updateProgress?.({
completedChunks: embeddingProgress.completed,
totalChunks: contentChunks.length,
totalFiles: filesToIndex.length,
})
// 定期内存清理
await this.memoryCleanup(batchCount)
} }
} else { } else {
// 不支持批量处理的提供商:使用原来的逐个处理逻辑 // 不支持批量处理的提供商:使用流式处理逻辑
const limit = pLimit(50) const limit = pLimit(10) // 从50降低到10减少并发压力
const abortController = new AbortController() const abortController = new AbortController()
const tasks = contentChunks.map((chunk) =>
limit(async () => {
if (abortController.signal.aborted) {
throw new Error('Operation was aborted')
}
try {
await backOff(
async () => {
const embedding = await embeddingModel.getEmbedding(chunk.content)
const embeddedChunk = {
path: chunk.path,
mtime: chunk.mtime,
content: chunk.content,
embedding,
metadata: chunk.metadata,
}
embeddingChunks.push(embeddedChunk)
embeddingProgress.completed++
updateProgress?.({
completedChunks: embeddingProgress.completed,
totalChunks: contentChunks.length,
totalFiles: filesToIndex.length,
})
},
{
numOfAttempts: 5,
startingDelay: 1000,
timeMultiple: 1.5,
jitter: 'full',
},
)
} catch (error) {
abortController.abort()
throw error
}
}),
)
await Promise.all(tasks) // 流式处理:分批处理并立即插入
} for (let i = 0; i < contentChunks.length; i += insertBatchSize) {
if (abortController.signal.aborted) {
throw new Error('Operation was aborted')
}
// all embedding generated, batch insert batchCount++
if (embeddingChunks.length > 0) { const batchChunks = contentChunks.slice(i, Math.min(i + insertBatchSize, contentChunks.length))
// batch insert all vectors const embeddedBatch: InsertVector[] = []
let inserted = 0
while (inserted < embeddingChunks.length) { const tasks = batchChunks.map((chunk) =>
const chunksToInsert = embeddingChunks.slice( limit(async () => {
inserted, if (abortController.signal.aborted) {
Math.min(inserted + insertBatchSize, embeddingChunks.length) throw new Error('Operation was aborted')
}
try {
await backOff(
async () => {
const embedding = await embeddingModel.getEmbedding(chunk.content)
const embeddedChunk = {
path: chunk.path,
mtime: chunk.mtime,
content: chunk.content,
embedding,
metadata: chunk.metadata,
}
embeddedBatch.push(embeddedChunk)
},
{
numOfAttempts: 3, // 减少重试次数
startingDelay: 500, // 减少延迟
timeMultiple: 1.5,
jitter: 'full',
},
)
} catch (error) {
abortController.abort()
throw error
}
}),
) )
await this.repository.insertVectors(chunksToInsert, embeddingModel)
inserted += chunksToInsert.length await Promise.all(tasks)
// 立即插入当前批次
if (embeddedBatch.length > 0) {
await this.repository.insertVectors(embeddedBatch, embeddingModel)
// 清理批次数据
embeddedBatch.length = 0
}
embeddingProgress.completed += batchChunks.length
updateProgress?.({
completedChunks: embeddingProgress.completed,
totalChunks: contentChunks.length,
totalFiles: filesToIndex.length,
})
// 定期内存清理
await this.memoryCleanup(batchCount)
} }
} }
} catch (error) { } catch (error) {
@ -244,6 +304,9 @@ export class VectorManager {
console.error('Error embedding chunks:', error) console.error('Error embedding chunks:', error)
throw error throw error
} }
} finally {
// 最终清理
this.forceGarbageCollection()
} }
} }
@ -252,125 +315,160 @@ export class VectorManager {
chunkSize: number, chunkSize: number,
file: TFile file: TFile
) { ) {
// Delete existing vectors for the files
await this.repository.deleteVectorsForSingleFile(
file.path,
embeddingModel,
)
// Embed the files
const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
'markdown',
{
chunkSize,
},
)
const fileContent = await this.app.vault.cachedRead(file)
const fileDocuments = await textSplitter.createDocuments([
fileContent,
])
const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
return {
path: file.path,
mtime: file.stat.mtime,
content: chunk.pageContent,
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
},
}
})
const embeddingChunks: InsertVector[] = []
const insertBatchSize = 64 // 数据库插入批量大小
try { try {
if (embeddingModel.supportsBatch) { // Delete existing vectors for the files
// 支持批量处理的提供商:使用批量处理逻辑 await this.repository.deleteVectorsForSingleFile(
const embeddingBatchSize = 64 // API批量处理大小 file.path,
embeddingModel,
)
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) { // Embed the files
console.log(`Embedding batch ${i / embeddingBatchSize + 1} of ${Math.ceil(contentChunks.length / embeddingBatchSize)}`) const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length)) 'markdown',
const batchTexts = batchChunks.map(chunk => chunk.content) {
chunkSize,
},
)
let fileContent = await this.app.vault.cachedRead(file)
// 清理null字节防止PostgreSQL UTF8编码错误
fileContent = fileContent.replace(/\0/g, '')
const fileDocuments = await textSplitter.createDocuments([
fileContent,
])
await backOff( const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
async () => { return {
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) path: file.path,
mtime: file.stat.mtime,
// 合并embedding结果到chunk数据 content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
for (let j = 0; j < batchChunks.length; j++) { embedding: [],
const embeddedChunk: InsertVector = { metadata: {
path: batchChunks[j].path, startLine: Number(chunk.metadata.loc.lines.from),
mtime: batchChunks[j].mtime, endLine: Number(chunk.metadata.loc.lines.to),
content: batchChunks[j].content, },
embedding: batchEmbeddings[j],
metadata: batchChunks[j].metadata,
}
embeddingChunks.push(embeddedChunk)
}
},
{
numOfAttempts: 5,
startingDelay: 1000,
timeMultiple: 1.5,
jitter: 'full',
},
)
} }
} else { })
// 不支持批量处理的提供商:使用原来的逐个处理逻辑
const limit = pLimit(50) // 减少批量大小以降低内存压力
const abortController = new AbortController() const insertBatchSize = 16 // 从64降低到16
const tasks = contentChunks.map((chunk) => let batchCount = 0
limit(async () => {
try {
if (embeddingModel.supportsBatch) {
// 支持批量处理的提供商:使用流式处理逻辑
const embeddingBatchSize = 16 // 从64降低到16
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
batchCount++
console.log(`Embedding batch ${batchCount} of ${Math.ceil(contentChunks.length / embeddingBatchSize)}`)
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
const batchTexts = batchChunks.map(chunk => chunk.content)
const embeddedBatch: InsertVector[] = []
await backOff(
async () => {
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
// 合并embedding结果到chunk数据
for (let j = 0; j < batchChunks.length; j++) {
const embeddedChunk: InsertVector = {
path: batchChunks[j].path,
mtime: batchChunks[j].mtime,
content: batchChunks[j].content,
embedding: batchEmbeddings[j],
metadata: batchChunks[j].metadata,
}
embeddedBatch.push(embeddedChunk)
}
},
{
numOfAttempts: 3, // 减少重试次数
startingDelay: 500, // 减少延迟
timeMultiple: 1.5,
jitter: 'full',
},
)
// 立即插入当前批次
if (embeddedBatch.length > 0) {
await this.repository.insertVectors(embeddedBatch, embeddingModel)
// 清理批次数据
embeddedBatch.length = 0
}
// 定期内存清理
await this.memoryCleanup(batchCount)
}
} else {
// 不支持批量处理的提供商:使用流式处理逻辑
const limit = pLimit(10) // 从50降低到10
const abortController = new AbortController()
// 流式处理:分批处理并立即插入
for (let i = 0; i < contentChunks.length; i += insertBatchSize) {
if (abortController.signal.aborted) { if (abortController.signal.aborted) {
throw new Error('Operation was aborted') throw new Error('Operation was aborted')
} }
try {
await backOff( batchCount++
async () => { const batchChunks = contentChunks.slice(i, Math.min(i + insertBatchSize, contentChunks.length))
const embedding = await embeddingModel.getEmbedding(chunk.content) const embeddedBatch: InsertVector[] = []
const embeddedChunk = {
path: chunk.path, const tasks = batchChunks.map((chunk) =>
mtime: chunk.mtime, limit(async () => {
content: chunk.content, if (abortController.signal.aborted) {
embedding, throw new Error('Operation was aborted')
metadata: chunk.metadata, }
} try {
embeddingChunks.push(embeddedChunk) await backOff(
}, async () => {
{ const embedding = await embeddingModel.getEmbedding(chunk.content)
numOfAttempts: 5, const embeddedChunk = {
startingDelay: 1000, path: chunk.path,
timeMultiple: 1.5, mtime: chunk.mtime,
jitter: 'full', content: chunk.content,
}, embedding,
) metadata: chunk.metadata,
} catch (error) { }
abortController.abort() embeddedBatch.push(embeddedChunk)
throw error },
{
numOfAttempts: 3, // 减少重试次数
startingDelay: 500, // 减少延迟
timeMultiple: 1.5,
jitter: 'full',
},
)
} catch (error) {
abortController.abort()
throw error
}
}),
)
await Promise.all(tasks)
// 立即插入当前批次
if (embeddedBatch.length > 0) {
await this.repository.insertVectors(embeddedBatch, embeddingModel)
// 清理批次数据
embeddedBatch.length = 0
} }
}),
)
await Promise.all(tasks) // 定期内存清理
} await this.memoryCleanup(batchCount)
}
// all embedding generated, batch insert
if (embeddingChunks.length > 0) {
let inserted = 0
while (inserted < embeddingChunks.length) {
const chunksToInsert = embeddingChunks.slice(inserted, Math.min(inserted + insertBatchSize, embeddingChunks.length))
await this.repository.insertVectors(chunksToInsert, embeddingModel)
inserted += chunksToInsert.length
} }
} catch (error) {
console.error('Error embedding chunks:', error)
} finally {
// 最终清理
this.forceGarbageCollection()
} }
} catch (error) { } catch (error) {
console.error('Error embedding chunks:', error) console.warn(`跳过文件 ${file.path}:`, error.message)
new Notice(`跳过文件 ${file.name}: ${error.message}`)
} }
} }
@ -424,25 +522,32 @@ export class VectorManager {
// Check for updated or new files // Check for updated or new files
filesToIndex = await Promise.all( filesToIndex = await Promise.all(
filesToIndex.map(async (file) => { filesToIndex.map(async (file) => {
const fileChunks = await this.repository.getVectorsByFilePath( try {
file.path, const fileChunks = await this.repository.getVectorsByFilePath(
embeddingModel, file.path,
) embeddingModel,
if (fileChunks.length === 0) { )
// File is not indexed, so we need to index it if (fileChunks.length === 0) {
const fileContent = await this.app.vault.cachedRead(file) // File is not indexed, so we need to index it
if (fileContent.length === 0) { let fileContent = await this.app.vault.cachedRead(file)
// Ignore empty files // 清理null字节防止PostgreSQL UTF8编码错误
return null fileContent = fileContent.replace(/\0/g, '')
if (fileContent.length === 0) {
// Ignore empty files
return null
}
return file
} }
return file const outOfDate = file.stat.mtime > fileChunks[0].mtime
if (outOfDate) {
// File has changed, so we need to re-index it
return file
}
return null
} catch (error) {
console.warn(`跳过文件 ${file.path}:`, error.message)
return null
} }
const outOfDate = file.stat.mtime > fileChunks[0].mtime
if (outOfDate) {
// File has changed, so we need to re-index it
return file
}
return null
}), }),
).then((files) => files.filter(Boolean)) ).then((files) => files.filter(Boolean))

View File

@ -102,7 +102,7 @@ export class VectorRepository {
const params = data.flatMap(vector => [ const params = data.flatMap(vector => [
vector.path, vector.path,
vector.mtime, vector.mtime,
vector.content, vector.content.replace(/\0/g, ''), // 清理null字节
`[${vector.embedding.join(',')}]`, // 转换为PostgreSQL vector格式 `[${vector.embedding.join(',')}]`, // 转换为PostgreSQL vector格式
vector.metadata vector.metadata
]) ])