update embeddings
This commit is contained in:
parent
c71a13a659
commit
f1ecc16c26
@ -258,6 +258,8 @@ export const InlineEdit: React.FC<InlineEditProps> = ({
|
|||||||
let fileContent: string;
|
let fileContent: string;
|
||||||
try {
|
try {
|
||||||
fileContent = await plugin.app.vault.cachedRead(activeFile);
|
fileContent = await plugin.app.vault.cachedRead(activeFile);
|
||||||
|
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||||||
|
fileContent = fileContent.replace(/\0/g, '');
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
const error = err as Error;
|
const error = err as Error;
|
||||||
console.error(t("inlineEdit.readFileError"), error.message);
|
console.error(t("inlineEdit.readFileError"), error.message);
|
||||||
@ -278,7 +280,9 @@ export const InlineEdit: React.FC<InlineEditProps> = ({
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const oldContent = await plugin.app.vault.read(activeFile);
|
let oldContent = await plugin.app.vault.read(activeFile);
|
||||||
|
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||||||
|
oldContent = oldContent.replace(/\0/g, '');
|
||||||
await plugin.app.workspace.getLeaf(true).setViewState({
|
await plugin.app.workspace.getLeaf(true).setViewState({
|
||||||
type: APPLY_VIEW_TYPE,
|
type: APPLY_VIEW_TYPE,
|
||||||
active: true,
|
active: true,
|
||||||
|
|||||||
@ -56,7 +56,9 @@ export async function matchSearchUsingCorePlugin(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
const content = await vault.cachedRead(file as TFile);
|
let content = await vault.cachedRead(file as TFile);
|
||||||
|
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||||||
|
content = content.replace(/\0/g, '');
|
||||||
const lines = content.split('\n');
|
const lines = content.split('\n');
|
||||||
|
|
||||||
// `fileMatches.result.content` holds an array of matches for the file.
|
// `fileMatches.result.content` holds an array of matches for the file.
|
||||||
|
|||||||
@ -52,6 +52,29 @@ export class VectorManager {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper that requests a garbage-collection pass.
// Calls `global.gc()` when `global` exists and exposes a truthy `gc`;
// otherwise falls back to `window.gc()` under the same check. When neither
// hook is present the method does nothing.
// NOTE(review): `gc` is presumably only defined when the host runtime was
// started with GC exposed — confirm this is ever true in the target environment.
|
||||||
|
private forceGarbageCollection() {
|
||||||
|
try {
|
||||||
|
if (typeof global !== 'undefined' && global.gc) {
|
||||||
|
global.gc()
|
||||||
|
} else if (typeof window !== 'undefined' && (window as any).gc) {
|
||||||
|
(window as any).gc()
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Best effort: any error raised while requesting garbage collection is deliberately ignored.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper that periodically triggers a memory cleanup between batches.
// `batchCount` is the caller's running batch counter; cleanup only runs when
// it is a multiple of 10 (note that 0 also satisfies this check).
|
||||||
|
private async memoryCleanup(batchCount: number) {
|
||||||
|
// Request garbage collection once every 10 batches.
|
||||||
|
if (batchCount % 10 === 0) {
|
||||||
|
this.forceGarbageCollection()
|
||||||
|
// Pause for 100 ms, intended to give the cleanup time to complete before the next batch.
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 100))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async updateVaultIndex(
|
async updateVaultIndex(
|
||||||
embeddingModel: EmbeddingModel,
|
embeddingModel: EmbeddingModel,
|
||||||
options: {
|
options: {
|
||||||
@ -100,29 +123,43 @@ export class VectorManager {
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const skippedFiles: string[] = []
|
||||||
const contentChunks: InsertVector[] = (
|
const contentChunks: InsertVector[] = (
|
||||||
await Promise.all(
|
await Promise.all(
|
||||||
filesToIndex.map(async (file) => {
|
filesToIndex.map(async (file) => {
|
||||||
const fileContent = await this.app.vault.cachedRead(file)
|
try {
|
||||||
const fileDocuments = await textSplitter.createDocuments([
|
let fileContent = await this.app.vault.cachedRead(file)
|
||||||
fileContent,
|
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||||||
])
|
fileContent = fileContent.replace(/\0/g, '')
|
||||||
return fileDocuments.map((chunk): InsertVector => {
|
const fileDocuments = await textSplitter.createDocuments([
|
||||||
return {
|
fileContent,
|
||||||
path: file.path,
|
])
|
||||||
mtime: file.stat.mtime,
|
return fileDocuments.map((chunk): InsertVector => {
|
||||||
content: chunk.pageContent,
|
return {
|
||||||
embedding: [],
|
path: file.path,
|
||||||
metadata: {
|
mtime: file.stat.mtime,
|
||||||
startLine: Number(chunk.metadata.loc.lines.from),
|
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
|
||||||
endLine: Number(chunk.metadata.loc.lines.to),
|
embedding: [],
|
||||||
},
|
metadata: {
|
||||||
}
|
startLine: Number(chunk.metadata.loc.lines.from),
|
||||||
})
|
endLine: Number(chunk.metadata.loc.lines.to),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
})
|
||||||
|
} catch (error) {
|
||||||
|
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||||||
|
skippedFiles.push(file.path)
|
||||||
|
return []
|
||||||
|
}
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
).flat()
|
).flat()
|
||||||
|
|
||||||
|
if (skippedFiles.length > 0) {
|
||||||
|
console.warn(`跳过了 ${skippedFiles.length} 个有问题的文件:`, skippedFiles)
|
||||||
|
new Notice(`跳过了 ${skippedFiles.length} 个有问题的文件`)
|
||||||
|
}
|
||||||
|
|
||||||
updateProgress?.({
|
updateProgress?.({
|
||||||
completedChunks: 0,
|
completedChunks: 0,
|
||||||
totalChunks: contentChunks.length,
|
totalChunks: contentChunks.length,
|
||||||
@ -130,18 +167,22 @@ export class VectorManager {
|
|||||||
})
|
})
|
||||||
|
|
||||||
const embeddingProgress = { completed: 0 }
|
const embeddingProgress = { completed: 0 }
|
||||||
const embeddingChunks: InsertVector[] = []
|
// 减少批量大小以降低内存压力
|
||||||
const insertBatchSize = 64 // 数据库插入批量大小
|
const insertBatchSize = 16 // 从64降低到16
|
||||||
|
let batchCount = 0
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (embeddingModel.supportsBatch) {
|
if (embeddingModel.supportsBatch) {
|
||||||
// 支持批量处理的提供商:使用批量处理逻辑
|
// 支持批量处理的提供商:使用流式处理逻辑
|
||||||
const embeddingBatchSize = 64 // API批量处理大小
|
const embeddingBatchSize = 16 // 从64降低到16
|
||||||
|
|
||||||
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
|
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
|
||||||
|
batchCount++
|
||||||
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
|
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
|
||||||
const batchTexts = batchChunks.map(chunk => chunk.content)
|
const batchTexts = batchChunks.map(chunk => chunk.content)
|
||||||
|
|
||||||
|
const embeddedBatch: InsertVector[] = []
|
||||||
|
|
||||||
await backOff(
|
await backOff(
|
||||||
async () => {
|
async () => {
|
||||||
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
||||||
@ -155,80 +196,99 @@ export class VectorManager {
|
|||||||
embedding: batchEmbeddings[j],
|
embedding: batchEmbeddings[j],
|
||||||
metadata: batchChunks[j].metadata,
|
metadata: batchChunks[j].metadata,
|
||||||
}
|
}
|
||||||
embeddingChunks.push(embeddedChunk)
|
embeddedBatch.push(embeddedChunk)
|
||||||
}
|
}
|
||||||
|
|
||||||
embeddingProgress.completed += batchChunks.length
|
|
||||||
updateProgress?.({
|
|
||||||
completedChunks: embeddingProgress.completed,
|
|
||||||
totalChunks: contentChunks.length,
|
|
||||||
totalFiles: filesToIndex.length,
|
|
||||||
})
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
numOfAttempts: 5,
|
numOfAttempts: 3, // 减少重试次数
|
||||||
startingDelay: 1000,
|
startingDelay: 500, // 减少延迟
|
||||||
timeMultiple: 1.5,
|
timeMultiple: 1.5,
|
||||||
jitter: 'full',
|
jitter: 'full',
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// 立即插入当前批次,避免内存累积
|
||||||
|
if (embeddedBatch.length > 0) {
|
||||||
|
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||||||
|
// 清理批次数据
|
||||||
|
embeddedBatch.length = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
embeddingProgress.completed += batchChunks.length
|
||||||
|
updateProgress?.({
|
||||||
|
completedChunks: embeddingProgress.completed,
|
||||||
|
totalChunks: contentChunks.length,
|
||||||
|
totalFiles: filesToIndex.length,
|
||||||
|
})
|
||||||
|
|
||||||
|
// 定期内存清理
|
||||||
|
await this.memoryCleanup(batchCount)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// 不支持批量处理的提供商:使用原来的逐个处理逻辑
|
// 不支持批量处理的提供商:使用流式处理逻辑
|
||||||
const limit = pLimit(50)
|
const limit = pLimit(10) // 从50降低到10,减少并发压力
|
||||||
const abortController = new AbortController()
|
const abortController = new AbortController()
|
||||||
const tasks = contentChunks.map((chunk) =>
|
|
||||||
limit(async () => {
|
|
||||||
if (abortController.signal.aborted) {
|
|
||||||
throw new Error('Operation was aborted')
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
await backOff(
|
|
||||||
async () => {
|
|
||||||
const embedding = await embeddingModel.getEmbedding(chunk.content)
|
|
||||||
const embeddedChunk = {
|
|
||||||
path: chunk.path,
|
|
||||||
mtime: chunk.mtime,
|
|
||||||
content: chunk.content,
|
|
||||||
embedding,
|
|
||||||
metadata: chunk.metadata,
|
|
||||||
}
|
|
||||||
embeddingChunks.push(embeddedChunk)
|
|
||||||
embeddingProgress.completed++
|
|
||||||
updateProgress?.({
|
|
||||||
completedChunks: embeddingProgress.completed,
|
|
||||||
totalChunks: contentChunks.length,
|
|
||||||
totalFiles: filesToIndex.length,
|
|
||||||
})
|
|
||||||
},
|
|
||||||
{
|
|
||||||
numOfAttempts: 5,
|
|
||||||
startingDelay: 1000,
|
|
||||||
timeMultiple: 1.5,
|
|
||||||
jitter: 'full',
|
|
||||||
},
|
|
||||||
)
|
|
||||||
} catch (error) {
|
|
||||||
abortController.abort()
|
|
||||||
throw error
|
|
||||||
}
|
|
||||||
}),
|
|
||||||
)
|
|
||||||
|
|
||||||
await Promise.all(tasks)
|
// 流式处理:分批处理并立即插入
|
||||||
}
|
for (let i = 0; i < contentChunks.length; i += insertBatchSize) {
|
||||||
|
if (abortController.signal.aborted) {
|
||||||
|
throw new Error('Operation was aborted')
|
||||||
|
}
|
||||||
|
|
||||||
// all embedding generated, batch insert
|
batchCount++
|
||||||
if (embeddingChunks.length > 0) {
|
const batchChunks = contentChunks.slice(i, Math.min(i + insertBatchSize, contentChunks.length))
|
||||||
// batch insert all vectors
|
const embeddedBatch: InsertVector[] = []
|
||||||
let inserted = 0
|
|
||||||
while (inserted < embeddingChunks.length) {
|
const tasks = batchChunks.map((chunk) =>
|
||||||
const chunksToInsert = embeddingChunks.slice(
|
limit(async () => {
|
||||||
inserted,
|
if (abortController.signal.aborted) {
|
||||||
Math.min(inserted + insertBatchSize, embeddingChunks.length)
|
throw new Error('Operation was aborted')
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
await backOff(
|
||||||
|
async () => {
|
||||||
|
const embedding = await embeddingModel.getEmbedding(chunk.content)
|
||||||
|
const embeddedChunk = {
|
||||||
|
path: chunk.path,
|
||||||
|
mtime: chunk.mtime,
|
||||||
|
content: chunk.content,
|
||||||
|
embedding,
|
||||||
|
metadata: chunk.metadata,
|
||||||
|
}
|
||||||
|
embeddedBatch.push(embeddedChunk)
|
||||||
|
},
|
||||||
|
{
|
||||||
|
numOfAttempts: 3, // 减少重试次数
|
||||||
|
startingDelay: 500, // 减少延迟
|
||||||
|
timeMultiple: 1.5,
|
||||||
|
jitter: 'full',
|
||||||
|
},
|
||||||
|
)
|
||||||
|
} catch (error) {
|
||||||
|
abortController.abort()
|
||||||
|
throw error
|
||||||
|
}
|
||||||
|
}),
|
||||||
)
|
)
|
||||||
await this.repository.insertVectors(chunksToInsert, embeddingModel)
|
|
||||||
inserted += chunksToInsert.length
|
await Promise.all(tasks)
|
||||||
|
|
||||||
|
// 立即插入当前批次
|
||||||
|
if (embeddedBatch.length > 0) {
|
||||||
|
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||||||
|
// 清理批次数据
|
||||||
|
embeddedBatch.length = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
embeddingProgress.completed += batchChunks.length
|
||||||
|
updateProgress?.({
|
||||||
|
completedChunks: embeddingProgress.completed,
|
||||||
|
totalChunks: contentChunks.length,
|
||||||
|
totalFiles: filesToIndex.length,
|
||||||
|
})
|
||||||
|
|
||||||
|
// 定期内存清理
|
||||||
|
await this.memoryCleanup(batchCount)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -244,6 +304,9 @@ export class VectorManager {
|
|||||||
console.error('Error embedding chunks:', error)
|
console.error('Error embedding chunks:', error)
|
||||||
throw error
|
throw error
|
||||||
}
|
}
|
||||||
|
} finally {
|
||||||
|
// 最终清理
|
||||||
|
this.forceGarbageCollection()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -252,125 +315,160 @@ export class VectorManager {
|
|||||||
chunkSize: number,
|
chunkSize: number,
|
||||||
file: TFile
|
file: TFile
|
||||||
) {
|
) {
|
||||||
|
|
||||||
// Delete existing vectors for the files
|
|
||||||
await this.repository.deleteVectorsForSingleFile(
|
|
||||||
file.path,
|
|
||||||
embeddingModel,
|
|
||||||
)
|
|
||||||
|
|
||||||
// Embed the files
|
|
||||||
const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
|
|
||||||
'markdown',
|
|
||||||
{
|
|
||||||
chunkSize,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
const fileContent = await this.app.vault.cachedRead(file)
|
|
||||||
const fileDocuments = await textSplitter.createDocuments([
|
|
||||||
fileContent,
|
|
||||||
])
|
|
||||||
|
|
||||||
const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
|
|
||||||
return {
|
|
||||||
path: file.path,
|
|
||||||
mtime: file.stat.mtime,
|
|
||||||
content: chunk.pageContent,
|
|
||||||
embedding: [],
|
|
||||||
metadata: {
|
|
||||||
startLine: Number(chunk.metadata.loc.lines.from),
|
|
||||||
endLine: Number(chunk.metadata.loc.lines.to),
|
|
||||||
},
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
const embeddingChunks: InsertVector[] = []
|
|
||||||
const insertBatchSize = 64 // 数据库插入批量大小
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (embeddingModel.supportsBatch) {
|
// Delete existing vectors for the files
|
||||||
// 支持批量处理的提供商:使用批量处理逻辑
|
await this.repository.deleteVectorsForSingleFile(
|
||||||
const embeddingBatchSize = 64 // API批量处理大小
|
file.path,
|
||||||
|
embeddingModel,
|
||||||
|
)
|
||||||
|
|
||||||
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
|
// Embed the files
|
||||||
console.log(`Embedding batch ${i / embeddingBatchSize + 1} of ${Math.ceil(contentChunks.length / embeddingBatchSize)}`)
|
const textSplitter = RecursiveCharacterTextSplitter.fromLanguage(
|
||||||
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
|
'markdown',
|
||||||
const batchTexts = batchChunks.map(chunk => chunk.content)
|
{
|
||||||
|
chunkSize,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
let fileContent = await this.app.vault.cachedRead(file)
|
||||||
|
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||||||
|
fileContent = fileContent.replace(/\0/g, '')
|
||||||
|
const fileDocuments = await textSplitter.createDocuments([
|
||||||
|
fileContent,
|
||||||
|
])
|
||||||
|
|
||||||
await backOff(
|
const contentChunks: InsertVector[] = fileDocuments.map((chunk): InsertVector => {
|
||||||
async () => {
|
return {
|
||||||
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
path: file.path,
|
||||||
|
mtime: file.stat.mtime,
|
||||||
// 合并embedding结果到chunk数据
|
content: chunk.pageContent.replace(/\0/g, ''), // 再次清理,确保安全
|
||||||
for (let j = 0; j < batchChunks.length; j++) {
|
embedding: [],
|
||||||
const embeddedChunk: InsertVector = {
|
metadata: {
|
||||||
path: batchChunks[j].path,
|
startLine: Number(chunk.metadata.loc.lines.from),
|
||||||
mtime: batchChunks[j].mtime,
|
endLine: Number(chunk.metadata.loc.lines.to),
|
||||||
content: batchChunks[j].content,
|
},
|
||||||
embedding: batchEmbeddings[j],
|
|
||||||
metadata: batchChunks[j].metadata,
|
|
||||||
}
|
|
||||||
embeddingChunks.push(embeddedChunk)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
numOfAttempts: 5,
|
|
||||||
startingDelay: 1000,
|
|
||||||
timeMultiple: 1.5,
|
|
||||||
jitter: 'full',
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
} else {
|
})
|
||||||
// 不支持批量处理的提供商:使用原来的逐个处理逻辑
|
|
||||||
const limit = pLimit(50)
|
// 减少批量大小以降低内存压力
|
||||||
const abortController = new AbortController()
|
const insertBatchSize = 16 // 从64降低到16
|
||||||
const tasks = contentChunks.map((chunk) =>
|
let batchCount = 0
|
||||||
limit(async () => {
|
|
||||||
|
try {
|
||||||
|
if (embeddingModel.supportsBatch) {
|
||||||
|
// 支持批量处理的提供商:使用流式处理逻辑
|
||||||
|
const embeddingBatchSize = 16 // 从64降低到16
|
||||||
|
|
||||||
|
for (let i = 0; i < contentChunks.length; i += embeddingBatchSize) {
|
||||||
|
batchCount++
|
||||||
|
console.log(`Embedding batch ${batchCount} of ${Math.ceil(contentChunks.length / embeddingBatchSize)}`)
|
||||||
|
const batchChunks = contentChunks.slice(i, Math.min(i + embeddingBatchSize, contentChunks.length))
|
||||||
|
const batchTexts = batchChunks.map(chunk => chunk.content)
|
||||||
|
|
||||||
|
const embeddedBatch: InsertVector[] = []
|
||||||
|
|
||||||
|
await backOff(
|
||||||
|
async () => {
|
||||||
|
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
|
||||||
|
|
||||||
|
// 合并embedding结果到chunk数据
|
||||||
|
for (let j = 0; j < batchChunks.length; j++) {
|
||||||
|
const embeddedChunk: InsertVector = {
|
||||||
|
path: batchChunks[j].path,
|
||||||
|
mtime: batchChunks[j].mtime,
|
||||||
|
content: batchChunks[j].content,
|
||||||
|
embedding: batchEmbeddings[j],
|
||||||
|
metadata: batchChunks[j].metadata,
|
||||||
|
}
|
||||||
|
embeddedBatch.push(embeddedChunk)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
numOfAttempts: 3, // 减少重试次数
|
||||||
|
startingDelay: 500, // 减少延迟
|
||||||
|
timeMultiple: 1.5,
|
||||||
|
jitter: 'full',
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
// 立即插入当前批次
|
||||||
|
if (embeddedBatch.length > 0) {
|
||||||
|
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||||||
|
// 清理批次数据
|
||||||
|
embeddedBatch.length = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// 定期内存清理
|
||||||
|
await this.memoryCleanup(batchCount)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// 不支持批量处理的提供商:使用流式处理逻辑
|
||||||
|
const limit = pLimit(10) // 从50降低到10
|
||||||
|
const abortController = new AbortController()
|
||||||
|
|
||||||
|
// 流式处理:分批处理并立即插入
|
||||||
|
for (let i = 0; i < contentChunks.length; i += insertBatchSize) {
|
||||||
if (abortController.signal.aborted) {
|
if (abortController.signal.aborted) {
|
||||||
throw new Error('Operation was aborted')
|
throw new Error('Operation was aborted')
|
||||||
}
|
}
|
||||||
try {
|
|
||||||
await backOff(
|
batchCount++
|
||||||
async () => {
|
const batchChunks = contentChunks.slice(i, Math.min(i + insertBatchSize, contentChunks.length))
|
||||||
const embedding = await embeddingModel.getEmbedding(chunk.content)
|
const embeddedBatch: InsertVector[] = []
|
||||||
const embeddedChunk = {
|
|
||||||
path: chunk.path,
|
const tasks = batchChunks.map((chunk) =>
|
||||||
mtime: chunk.mtime,
|
limit(async () => {
|
||||||
content: chunk.content,
|
if (abortController.signal.aborted) {
|
||||||
embedding,
|
throw new Error('Operation was aborted')
|
||||||
metadata: chunk.metadata,
|
}
|
||||||
}
|
try {
|
||||||
embeddingChunks.push(embeddedChunk)
|
await backOff(
|
||||||
},
|
async () => {
|
||||||
{
|
const embedding = await embeddingModel.getEmbedding(chunk.content)
|
||||||
numOfAttempts: 5,
|
const embeddedChunk = {
|
||||||
startingDelay: 1000,
|
path: chunk.path,
|
||||||
timeMultiple: 1.5,
|
mtime: chunk.mtime,
|
||||||
jitter: 'full',
|
content: chunk.content,
|
||||||
},
|
embedding,
|
||||||
)
|
metadata: chunk.metadata,
|
||||||
} catch (error) {
|
}
|
||||||
abortController.abort()
|
embeddedBatch.push(embeddedChunk)
|
||||||
throw error
|
},
|
||||||
|
{
|
||||||
|
numOfAttempts: 3, // 减少重试次数
|
||||||
|
startingDelay: 500, // 减少延迟
|
||||||
|
timeMultiple: 1.5,
|
||||||
|
jitter: 'full',
|
||||||
|
},
|
||||||
|
)
|
||||||
|
} catch (error) {
|
||||||
|
abortController.abort()
|
||||||
|
throw error
|
||||||
|
}
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
|
||||||
|
await Promise.all(tasks)
|
||||||
|
|
||||||
|
// 立即插入当前批次
|
||||||
|
if (embeddedBatch.length > 0) {
|
||||||
|
await this.repository.insertVectors(embeddedBatch, embeddingModel)
|
||||||
|
// 清理批次数据
|
||||||
|
embeddedBatch.length = 0
|
||||||
}
|
}
|
||||||
}),
|
|
||||||
)
|
|
||||||
|
|
||||||
await Promise.all(tasks)
|
// 定期内存清理
|
||||||
}
|
await this.memoryCleanup(batchCount)
|
||||||
|
}
|
||||||
// all embedding generated, batch insert
|
|
||||||
if (embeddingChunks.length > 0) {
|
|
||||||
let inserted = 0
|
|
||||||
while (inserted < embeddingChunks.length) {
|
|
||||||
const chunksToInsert = embeddingChunks.slice(inserted, Math.min(inserted + insertBatchSize, embeddingChunks.length))
|
|
||||||
await this.repository.insertVectors(chunksToInsert, embeddingModel)
|
|
||||||
inserted += chunksToInsert.length
|
|
||||||
}
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error embedding chunks:', error)
|
||||||
|
} finally {
|
||||||
|
// 最终清理
|
||||||
|
this.forceGarbageCollection()
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error embedding chunks:', error)
|
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||||||
|
new Notice(`跳过文件 ${file.name}: ${error.message}`)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -424,25 +522,32 @@ export class VectorManager {
|
|||||||
// Check for updated or new files
|
// Check for updated or new files
|
||||||
filesToIndex = await Promise.all(
|
filesToIndex = await Promise.all(
|
||||||
filesToIndex.map(async (file) => {
|
filesToIndex.map(async (file) => {
|
||||||
const fileChunks = await this.repository.getVectorsByFilePath(
|
try {
|
||||||
file.path,
|
const fileChunks = await this.repository.getVectorsByFilePath(
|
||||||
embeddingModel,
|
file.path,
|
||||||
)
|
embeddingModel,
|
||||||
if (fileChunks.length === 0) {
|
)
|
||||||
// File is not indexed, so we need to index it
|
if (fileChunks.length === 0) {
|
||||||
const fileContent = await this.app.vault.cachedRead(file)
|
// File is not indexed, so we need to index it
|
||||||
if (fileContent.length === 0) {
|
let fileContent = await this.app.vault.cachedRead(file)
|
||||||
// Ignore empty files
|
// 清理null字节,防止PostgreSQL UTF8编码错误
|
||||||
return null
|
fileContent = fileContent.replace(/\0/g, '')
|
||||||
|
if (fileContent.length === 0) {
|
||||||
|
// Ignore empty files
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
return file
|
||||||
}
|
}
|
||||||
return file
|
const outOfDate = file.stat.mtime > fileChunks[0].mtime
|
||||||
|
if (outOfDate) {
|
||||||
|
// File has changed, so we need to re-index it
|
||||||
|
return file
|
||||||
|
}
|
||||||
|
return null
|
||||||
|
} catch (error) {
|
||||||
|
console.warn(`跳过文件 ${file.path}:`, error.message)
|
||||||
|
return null
|
||||||
}
|
}
|
||||||
const outOfDate = file.stat.mtime > fileChunks[0].mtime
|
|
||||||
if (outOfDate) {
|
|
||||||
// File has changed, so we need to re-index it
|
|
||||||
return file
|
|
||||||
}
|
|
||||||
return null
|
|
||||||
}),
|
}),
|
||||||
).then((files) => files.filter(Boolean))
|
).then((files) => files.filter(Boolean))
|
||||||
|
|
||||||
|
|||||||
@ -102,7 +102,7 @@ export class VectorRepository {
|
|||||||
const params = data.flatMap(vector => [
|
const params = data.flatMap(vector => [
|
||||||
vector.path,
|
vector.path,
|
||||||
vector.mtime,
|
vector.mtime,
|
||||||
vector.content,
|
vector.content.replace(/\0/g, ''), // 清理null字节
|
||||||
`[${vector.embedding.join(',')}]`, // 转换为PostgreSQL vector格式
|
`[${vector.embedding.join(',')}]`, // 转换为PostgreSQL vector格式
|
||||||
vector.metadata
|
vector.metadata
|
||||||
])
|
])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user