update vector manager

This commit is contained in:
duanfuxiang 2025-07-13 07:02:54 +08:00
parent c1fbd4da21
commit 34296e6871
6 changed files with 1104 additions and 435 deletions

pnpm-lock.yaml (generated)

@ -123,6 +123,9 @@ importers:
handlebars:
specifier: ^4.7.7
version: 4.7.8
jieba-wasm:
specifier: ^2.2.0
version: 2.2.0
js-tiktoken:
specifier: ^1.0.15
version: 1.0.20
@ -583,9 +586,9 @@ packages:
'@codemirror/language@6.11.2':
resolution: {integrity: sha512-p44TsNArL4IVXDTbapUmEkAlvWs2CFQbcfc0ymDsis1kH2wh0gcY96AS29c/vp2d0y2Tquk1EDSaawpzilUiAw==}
'@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67':
resolution: {tarball: https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67}
version: 6.10.8
'@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76':
resolution: {tarball: https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76}
version: 6.11.2
'@codemirror/lint@0.20.3':
resolution: {integrity: sha512-06xUScbbspZ8mKoODQCEx6hz1bjaq9m8W8DxdycWARMiiX1wMtfCh/MoHpaL7ws/KUMwlsFFfp2qhm32oaCvVA==}
@ -669,8 +672,8 @@ packages:
cpu: [ppc64]
os: [aix]
'@esbuild/aix-ppc64@0.25.5':
resolution: {integrity: sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==}
'@esbuild/aix-ppc64@0.25.6':
resolution: {integrity: sha512-ShbM/3XxwuxjFiuVBHA+d3j5dyac0aEVVq1oluIDf71hUw0aRF59dV/efUsIwFnR6m8JNM2FjZOzmaZ8yG61kw==}
engines: {node: '>=18'}
cpu: [ppc64]
os: [aix]
@ -693,8 +696,8 @@ packages:
cpu: [arm64]
os: [android]
'@esbuild/android-arm64@0.25.5':
resolution: {integrity: sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==}
'@esbuild/android-arm64@0.25.6':
resolution: {integrity: sha512-hd5zdUarsK6strW+3Wxi5qWws+rJhCCbMiC9QZyzoxfk5uHRIE8T287giQxzVpEvCwuJ9Qjg6bEjcRJcgfLqoA==}
engines: {node: '>=18'}
cpu: [arm64]
os: [android]
@ -717,8 +720,8 @@ packages:
cpu: [arm]
os: [android]
'@esbuild/android-arm@0.25.5':
resolution: {integrity: sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==}
'@esbuild/android-arm@0.25.6':
resolution: {integrity: sha512-S8ToEOVfg++AU/bHwdksHNnyLyVM+eMVAOf6yRKFitnwnbwwPNqKr3srzFRe7nzV69RQKb5DgchIX5pt3L53xg==}
engines: {node: '>=18'}
cpu: [arm]
os: [android]
@ -741,8 +744,8 @@ packages:
cpu: [x64]
os: [android]
'@esbuild/android-x64@0.25.5':
resolution: {integrity: sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==}
'@esbuild/android-x64@0.25.6':
resolution: {integrity: sha512-0Z7KpHSr3VBIO9A/1wcT3NTy7EB4oNC4upJ5ye3R7taCc2GUdeynSLArnon5G8scPwaU866d3H4BCrE5xLW25A==}
engines: {node: '>=18'}
cpu: [x64]
os: [android]
@ -765,8 +768,8 @@ packages:
cpu: [arm64]
os: [darwin]
'@esbuild/darwin-arm64@0.25.5':
resolution: {integrity: sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==}
'@esbuild/darwin-arm64@0.25.6':
resolution: {integrity: sha512-FFCssz3XBavjxcFxKsGy2DYK5VSvJqa6y5HXljKzhRZ87LvEi13brPrf/wdyl/BbpbMKJNOr1Sd0jtW4Ge1pAA==}
engines: {node: '>=18'}
cpu: [arm64]
os: [darwin]
@ -789,8 +792,8 @@ packages:
cpu: [x64]
os: [darwin]
'@esbuild/darwin-x64@0.25.5':
resolution: {integrity: sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==}
'@esbuild/darwin-x64@0.25.6':
resolution: {integrity: sha512-GfXs5kry/TkGM2vKqK2oyiLFygJRqKVhawu3+DOCk7OxLy/6jYkWXhlHwOoTb0WqGnWGAS7sooxbZowy+pK9Yg==}
engines: {node: '>=18'}
cpu: [x64]
os: [darwin]
@ -813,8 +816,8 @@ packages:
cpu: [arm64]
os: [freebsd]
'@esbuild/freebsd-arm64@0.25.5':
resolution: {integrity: sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==}
'@esbuild/freebsd-arm64@0.25.6':
resolution: {integrity: sha512-aoLF2c3OvDn2XDTRvn8hN6DRzVVpDlj2B/F66clWd/FHLiHaG3aVZjxQX2DYphA5y/evbdGvC6Us13tvyt4pWg==}
engines: {node: '>=18'}
cpu: [arm64]
os: [freebsd]
@ -837,8 +840,8 @@ packages:
cpu: [x64]
os: [freebsd]
'@esbuild/freebsd-x64@0.25.5':
resolution: {integrity: sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==}
'@esbuild/freebsd-x64@0.25.6':
resolution: {integrity: sha512-2SkqTjTSo2dYi/jzFbU9Plt1vk0+nNg8YC8rOXXea+iA3hfNJWebKYPs3xnOUf9+ZWhKAaxnQNUf2X9LOpeiMQ==}
engines: {node: '>=18'}
cpu: [x64]
os: [freebsd]
@ -861,8 +864,8 @@ packages:
cpu: [arm64]
os: [linux]
'@esbuild/linux-arm64@0.25.5':
resolution: {integrity: sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==}
'@esbuild/linux-arm64@0.25.6':
resolution: {integrity: sha512-b967hU0gqKd9Drsh/UuAm21Khpoh6mPBSgz8mKRq4P5mVK8bpA+hQzmm/ZwGVULSNBzKdZPQBRT3+WuVavcWsQ==}
engines: {node: '>=18'}
cpu: [arm64]
os: [linux]
@ -885,8 +888,8 @@ packages:
cpu: [arm]
os: [linux]
'@esbuild/linux-arm@0.25.5':
resolution: {integrity: sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==}
'@esbuild/linux-arm@0.25.6':
resolution: {integrity: sha512-SZHQlzvqv4Du5PrKE2faN0qlbsaW/3QQfUUc6yO2EjFcA83xnwm91UbEEVx4ApZ9Z5oG8Bxz4qPE+HFwtVcfyw==}
engines: {node: '>=18'}
cpu: [arm]
os: [linux]
@ -909,8 +912,8 @@ packages:
cpu: [ia32]
os: [linux]
'@esbuild/linux-ia32@0.25.5':
resolution: {integrity: sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==}
'@esbuild/linux-ia32@0.25.6':
resolution: {integrity: sha512-aHWdQ2AAltRkLPOsKdi3xv0mZ8fUGPdlKEjIEhxCPm5yKEThcUjHpWB1idN74lfXGnZ5SULQSgtr5Qos5B0bPw==}
engines: {node: '>=18'}
cpu: [ia32]
os: [linux]
@ -933,8 +936,8 @@ packages:
cpu: [loong64]
os: [linux]
'@esbuild/linux-loong64@0.25.5':
resolution: {integrity: sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==}
'@esbuild/linux-loong64@0.25.6':
resolution: {integrity: sha512-VgKCsHdXRSQ7E1+QXGdRPlQ/e08bN6WMQb27/TMfV+vPjjTImuT9PmLXupRlC90S1JeNNW5lzkAEO/McKeJ2yg==}
engines: {node: '>=18'}
cpu: [loong64]
os: [linux]
@ -957,8 +960,8 @@ packages:
cpu: [mips64el]
os: [linux]
'@esbuild/linux-mips64el@0.25.5':
resolution: {integrity: sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==}
'@esbuild/linux-mips64el@0.25.6':
resolution: {integrity: sha512-WViNlpivRKT9/py3kCmkHnn44GkGXVdXfdc4drNmRl15zVQ2+D2uFwdlGh6IuK5AAnGTo2qPB1Djppj+t78rzw==}
engines: {node: '>=18'}
cpu: [mips64el]
os: [linux]
@ -981,8 +984,8 @@ packages:
cpu: [ppc64]
os: [linux]
'@esbuild/linux-ppc64@0.25.5':
resolution: {integrity: sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==}
'@esbuild/linux-ppc64@0.25.6':
resolution: {integrity: sha512-wyYKZ9NTdmAMb5730I38lBqVu6cKl4ZfYXIs31Baf8aoOtB4xSGi3THmDYt4BTFHk7/EcVixkOV2uZfwU3Q2Jw==}
engines: {node: '>=18'}
cpu: [ppc64]
os: [linux]
@ -1005,8 +1008,8 @@ packages:
cpu: [riscv64]
os: [linux]
'@esbuild/linux-riscv64@0.25.5':
resolution: {integrity: sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==}
'@esbuild/linux-riscv64@0.25.6':
resolution: {integrity: sha512-KZh7bAGGcrinEj4qzilJ4hqTY3Dg2U82c8bv+e1xqNqZCrCyc+TL9AUEn5WGKDzm3CfC5RODE/qc96OcbIe33w==}
engines: {node: '>=18'}
cpu: [riscv64]
os: [linux]
@ -1029,8 +1032,8 @@ packages:
cpu: [s390x]
os: [linux]
'@esbuild/linux-s390x@0.25.5':
resolution: {integrity: sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==}
'@esbuild/linux-s390x@0.25.6':
resolution: {integrity: sha512-9N1LsTwAuE9oj6lHMyyAM+ucxGiVnEqUdp4v7IaMmrwb06ZTEVCIs3oPPplVsnjPfyjmxwHxHMF8b6vzUVAUGw==}
engines: {node: '>=18'}
cpu: [s390x]
os: [linux]
@ -1053,14 +1056,14 @@ packages:
cpu: [x64]
os: [linux]
'@esbuild/linux-x64@0.25.5':
resolution: {integrity: sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==}
'@esbuild/linux-x64@0.25.6':
resolution: {integrity: sha512-A6bJB41b4lKFWRKNrWoP2LHsjVzNiaurf7wyj/XtFNTsnPuxwEBWHLty+ZE0dWBKuSK1fvKgrKaNjBS7qbFKig==}
engines: {node: '>=18'}
cpu: [x64]
os: [linux]
'@esbuild/netbsd-arm64@0.25.5':
resolution: {integrity: sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==}
'@esbuild/netbsd-arm64@0.25.6':
resolution: {integrity: sha512-IjA+DcwoVpjEvyxZddDqBY+uJ2Snc6duLpjmkXm/v4xuS3H+3FkLZlDm9ZsAbF9rsfP3zeA0/ArNDORZgrxR/Q==}
engines: {node: '>=18'}
cpu: [arm64]
os: [netbsd]
@ -1083,14 +1086,14 @@ packages:
cpu: [x64]
os: [netbsd]
'@esbuild/netbsd-x64@0.25.5':
resolution: {integrity: sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==}
'@esbuild/netbsd-x64@0.25.6':
resolution: {integrity: sha512-dUXuZr5WenIDlMHdMkvDc1FAu4xdWixTCRgP7RQLBOkkGgwuuzaGSYcOpW4jFxzpzL1ejb8yF620UxAqnBrR9g==}
engines: {node: '>=18'}
cpu: [x64]
os: [netbsd]
'@esbuild/openbsd-arm64@0.25.5':
resolution: {integrity: sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==}
'@esbuild/openbsd-arm64@0.25.6':
resolution: {integrity: sha512-l8ZCvXP0tbTJ3iaqdNf3pjaOSd5ex/e6/omLIQCVBLmHTlfXW3zAxQ4fnDmPLOB1x9xrcSi/xtCWFwCZRIaEwg==}
engines: {node: '>=18'}
cpu: [arm64]
os: [openbsd]
@ -1113,12 +1116,18 @@ packages:
cpu: [x64]
os: [openbsd]
'@esbuild/openbsd-x64@0.25.5':
resolution: {integrity: sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==}
'@esbuild/openbsd-x64@0.25.6':
resolution: {integrity: sha512-hKrmDa0aOFOr71KQ/19JC7az1P0GWtCN1t2ahYAf4O007DHZt/dW8ym5+CUdJhQ/qkZmI1HAF8KkJbEFtCL7gw==}
engines: {node: '>=18'}
cpu: [x64]
os: [openbsd]
'@esbuild/openharmony-arm64@0.25.6':
resolution: {integrity: sha512-+SqBcAWoB1fYKmpWoQP4pGtx+pUUC//RNYhFdbcSA16617cchuryuhOCRpPsjCblKukAckWsV+aQ3UKT/RMPcA==}
engines: {node: '>=18'}
cpu: [arm64]
os: [openharmony]
'@esbuild/sunos-x64@0.17.3':
resolution: {integrity: sha512-RxmhKLbTCDAY2xOfrww6ieIZkZF+KBqG7S2Ako2SljKXRFi+0863PspK74QQ7JpmWwncChY25JTJSbVBYGQk2Q==}
engines: {node: '>=12'}
@ -1137,8 +1146,8 @@ packages:
cpu: [x64]
os: [sunos]
'@esbuild/sunos-x64@0.25.5':
resolution: {integrity: sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==}
'@esbuild/sunos-x64@0.25.6':
resolution: {integrity: sha512-dyCGxv1/Br7MiSC42qinGL8KkG4kX0pEsdb0+TKhmJZgCUDBGmyo1/ArCjNGiOLiIAgdbWgmWgib4HoCi5t7kA==}
engines: {node: '>=18'}
cpu: [x64]
os: [sunos]
@ -1161,8 +1170,8 @@ packages:
cpu: [arm64]
os: [win32]
'@esbuild/win32-arm64@0.25.5':
resolution: {integrity: sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==}
'@esbuild/win32-arm64@0.25.6':
resolution: {integrity: sha512-42QOgcZeZOvXfsCBJF5Afw73t4veOId//XD3i+/9gSkhSV6Gk3VPlWncctI+JcOyERv85FUo7RxuxGy+z8A43Q==}
engines: {node: '>=18'}
cpu: [arm64]
os: [win32]
@ -1185,8 +1194,8 @@ packages:
cpu: [ia32]
os: [win32]
'@esbuild/win32-ia32@0.25.5':
resolution: {integrity: sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==}
'@esbuild/win32-ia32@0.25.6':
resolution: {integrity: sha512-4AWhgXmDuYN7rJI6ORB+uU9DHLq/erBbuMoAuB4VWJTu5KtCgcKYPynF0YI1VkBNuEfjNlLrFr9KZPJzrtLkrQ==}
engines: {node: '>=18'}
cpu: [ia32]
os: [win32]
@ -1209,8 +1218,8 @@ packages:
cpu: [x64]
os: [win32]
'@esbuild/win32-x64@0.25.5':
resolution: {integrity: sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==}
'@esbuild/win32-x64@0.25.6':
resolution: {integrity: sha512-NgJPHHbEpLQgDH2MjQu90pzW/5vvXIZ7KOnPyNBm92A6WgZ/7b6fJyUBjoumLqeOQQGqY2QjQxRo97ah4Sj0cA==}
engines: {node: '>=18'}
cpu: [x64]
os: [win32]
@ -3739,8 +3748,8 @@ packages:
engines: {node: '>=12'}
hasBin: true
esbuild@0.25.5:
resolution: {integrity: sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==}
esbuild@0.25.6:
resolution: {integrity: sha512-GVuzuUwtdsghE3ocJ9Bs8PNoF13HNQ5TXbEi2AhvVb8xU1Iwt9Fos9FEamfoee+u/TOsn7GUWc04lz46n2bbTg==}
engines: {node: '>=18'}
hasBin: true
@ -4723,6 +4732,9 @@ packages:
node-notifier:
optional: true
jieba-wasm@2.2.0:
resolution: {integrity: sha512-IwxgUf+EMutjLair3k41i0ApM33qeHNY9EFBKlI5/XtHcISkGt5YPmUvpDJe3hUflwRYhy9g29ZzTetGZw6XgQ==}
js-base64@3.7.7:
resolution: {integrity: sha512-7rCnleh0z2CkXhH67J8K1Ytz0b2Y+yxTPL+/KOJoa20hfnVQ/3/T6W/KflYI4bRHRagNeXeU2bkNGI3v1oS/lw==}
@ -7089,7 +7101,7 @@ snapshots:
'@lezer/lr': 1.4.2
style-mod: 4.1.2
'@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67':
'@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76':
dependencies:
'@codemirror/state': 6.5.2
'@codemirror/view': 6.38.0
@ -7190,7 +7202,7 @@ snapshots:
'@esbuild/aix-ppc64@0.19.12':
optional: true
'@esbuild/aix-ppc64@0.25.5':
'@esbuild/aix-ppc64@0.25.6':
optional: true
'@esbuild/android-arm64@0.17.3':
@ -7202,7 +7214,7 @@ snapshots:
'@esbuild/android-arm64@0.19.12':
optional: true
'@esbuild/android-arm64@0.25.5':
'@esbuild/android-arm64@0.25.6':
optional: true
'@esbuild/android-arm@0.17.3':
@ -7214,7 +7226,7 @@ snapshots:
'@esbuild/android-arm@0.19.12':
optional: true
'@esbuild/android-arm@0.25.5':
'@esbuild/android-arm@0.25.6':
optional: true
'@esbuild/android-x64@0.17.3':
@ -7226,7 +7238,7 @@ snapshots:
'@esbuild/android-x64@0.19.12':
optional: true
'@esbuild/android-x64@0.25.5':
'@esbuild/android-x64@0.25.6':
optional: true
'@esbuild/darwin-arm64@0.17.3':
@ -7238,7 +7250,7 @@ snapshots:
'@esbuild/darwin-arm64@0.19.12':
optional: true
'@esbuild/darwin-arm64@0.25.5':
'@esbuild/darwin-arm64@0.25.6':
optional: true
'@esbuild/darwin-x64@0.17.3':
@ -7250,7 +7262,7 @@ snapshots:
'@esbuild/darwin-x64@0.19.12':
optional: true
'@esbuild/darwin-x64@0.25.5':
'@esbuild/darwin-x64@0.25.6':
optional: true
'@esbuild/freebsd-arm64@0.17.3':
@ -7262,7 +7274,7 @@ snapshots:
'@esbuild/freebsd-arm64@0.19.12':
optional: true
'@esbuild/freebsd-arm64@0.25.5':
'@esbuild/freebsd-arm64@0.25.6':
optional: true
'@esbuild/freebsd-x64@0.17.3':
@ -7274,7 +7286,7 @@ snapshots:
'@esbuild/freebsd-x64@0.19.12':
optional: true
'@esbuild/freebsd-x64@0.25.5':
'@esbuild/freebsd-x64@0.25.6':
optional: true
'@esbuild/linux-arm64@0.17.3':
@ -7286,7 +7298,7 @@ snapshots:
'@esbuild/linux-arm64@0.19.12':
optional: true
'@esbuild/linux-arm64@0.25.5':
'@esbuild/linux-arm64@0.25.6':
optional: true
'@esbuild/linux-arm@0.17.3':
@ -7298,7 +7310,7 @@ snapshots:
'@esbuild/linux-arm@0.19.12':
optional: true
'@esbuild/linux-arm@0.25.5':
'@esbuild/linux-arm@0.25.6':
optional: true
'@esbuild/linux-ia32@0.17.3':
@ -7310,7 +7322,7 @@ snapshots:
'@esbuild/linux-ia32@0.19.12':
optional: true
'@esbuild/linux-ia32@0.25.5':
'@esbuild/linux-ia32@0.25.6':
optional: true
'@esbuild/linux-loong64@0.17.3':
@ -7322,7 +7334,7 @@ snapshots:
'@esbuild/linux-loong64@0.19.12':
optional: true
'@esbuild/linux-loong64@0.25.5':
'@esbuild/linux-loong64@0.25.6':
optional: true
'@esbuild/linux-mips64el@0.17.3':
@ -7334,7 +7346,7 @@ snapshots:
'@esbuild/linux-mips64el@0.19.12':
optional: true
'@esbuild/linux-mips64el@0.25.5':
'@esbuild/linux-mips64el@0.25.6':
optional: true
'@esbuild/linux-ppc64@0.17.3':
@ -7346,7 +7358,7 @@ snapshots:
'@esbuild/linux-ppc64@0.19.12':
optional: true
'@esbuild/linux-ppc64@0.25.5':
'@esbuild/linux-ppc64@0.25.6':
optional: true
'@esbuild/linux-riscv64@0.17.3':
@ -7358,7 +7370,7 @@ snapshots:
'@esbuild/linux-riscv64@0.19.12':
optional: true
'@esbuild/linux-riscv64@0.25.5':
'@esbuild/linux-riscv64@0.25.6':
optional: true
'@esbuild/linux-s390x@0.17.3':
@ -7370,7 +7382,7 @@ snapshots:
'@esbuild/linux-s390x@0.19.12':
optional: true
'@esbuild/linux-s390x@0.25.5':
'@esbuild/linux-s390x@0.25.6':
optional: true
'@esbuild/linux-x64@0.17.3':
@ -7382,10 +7394,10 @@ snapshots:
'@esbuild/linux-x64@0.19.12':
optional: true
'@esbuild/linux-x64@0.25.5':
'@esbuild/linux-x64@0.25.6':
optional: true
'@esbuild/netbsd-arm64@0.25.5':
'@esbuild/netbsd-arm64@0.25.6':
optional: true
'@esbuild/netbsd-x64@0.17.3':
@ -7397,10 +7409,10 @@ snapshots:
'@esbuild/netbsd-x64@0.19.12':
optional: true
'@esbuild/netbsd-x64@0.25.5':
'@esbuild/netbsd-x64@0.25.6':
optional: true
'@esbuild/openbsd-arm64@0.25.5':
'@esbuild/openbsd-arm64@0.25.6':
optional: true
'@esbuild/openbsd-x64@0.17.3':
@ -7412,7 +7424,10 @@ snapshots:
'@esbuild/openbsd-x64@0.19.12':
optional: true
'@esbuild/openbsd-x64@0.25.5':
'@esbuild/openbsd-x64@0.25.6':
optional: true
'@esbuild/openharmony-arm64@0.25.6':
optional: true
'@esbuild/sunos-x64@0.17.3':
@ -7424,7 +7439,7 @@ snapshots:
'@esbuild/sunos-x64@0.19.12':
optional: true
'@esbuild/sunos-x64@0.25.5':
'@esbuild/sunos-x64@0.25.6':
optional: true
'@esbuild/win32-arm64@0.17.3':
@ -7436,7 +7451,7 @@ snapshots:
'@esbuild/win32-arm64@0.19.12':
optional: true
'@esbuild/win32-arm64@0.25.5':
'@esbuild/win32-arm64@0.25.6':
optional: true
'@esbuild/win32-ia32@0.17.3':
@ -7448,7 +7463,7 @@ snapshots:
'@esbuild/win32-ia32@0.19.12':
optional: true
'@esbuild/win32-ia32@0.25.5':
'@esbuild/win32-ia32@0.25.6':
optional: true
'@esbuild/win32-x64@0.17.3':
@ -7460,7 +7475,7 @@ snapshots:
'@esbuild/win32-x64@0.19.12':
optional: true
'@esbuild/win32-x64@0.25.5':
'@esbuild/win32-x64@0.25.6':
optional: true
'@eslint-community/eslint-utils@4.7.0(eslint@8.57.1)':
@ -10339,7 +10354,7 @@ snapshots:
esbuild-plugin-inline-worker@0.1.1:
dependencies:
esbuild: 0.25.5
esbuild: 0.25.6
find-cache-dir: 3.3.2
esbuild-register@3.6.0(esbuild@0.19.12):
@ -10425,33 +10440,34 @@ snapshots:
'@esbuild/win32-ia32': 0.19.12
'@esbuild/win32-x64': 0.19.12
esbuild@0.25.5:
esbuild@0.25.6:
optionalDependencies:
'@esbuild/aix-ppc64': 0.25.5
'@esbuild/android-arm': 0.25.5
'@esbuild/android-arm64': 0.25.5
'@esbuild/android-x64': 0.25.5
'@esbuild/darwin-arm64': 0.25.5
'@esbuild/darwin-x64': 0.25.5
'@esbuild/freebsd-arm64': 0.25.5
'@esbuild/freebsd-x64': 0.25.5
'@esbuild/linux-arm': 0.25.5
'@esbuild/linux-arm64': 0.25.5
'@esbuild/linux-ia32': 0.25.5
'@esbuild/linux-loong64': 0.25.5
'@esbuild/linux-mips64el': 0.25.5
'@esbuild/linux-ppc64': 0.25.5
'@esbuild/linux-riscv64': 0.25.5
'@esbuild/linux-s390x': 0.25.5
'@esbuild/linux-x64': 0.25.5
'@esbuild/netbsd-arm64': 0.25.5
'@esbuild/netbsd-x64': 0.25.5
'@esbuild/openbsd-arm64': 0.25.5
'@esbuild/openbsd-x64': 0.25.5
'@esbuild/sunos-x64': 0.25.5
'@esbuild/win32-arm64': 0.25.5
'@esbuild/win32-ia32': 0.25.5
'@esbuild/win32-x64': 0.25.5
'@esbuild/aix-ppc64': 0.25.6
'@esbuild/android-arm': 0.25.6
'@esbuild/android-arm64': 0.25.6
'@esbuild/android-x64': 0.25.6
'@esbuild/darwin-arm64': 0.25.6
'@esbuild/darwin-x64': 0.25.6
'@esbuild/freebsd-arm64': 0.25.6
'@esbuild/freebsd-x64': 0.25.6
'@esbuild/linux-arm': 0.25.6
'@esbuild/linux-arm64': 0.25.6
'@esbuild/linux-ia32': 0.25.6
'@esbuild/linux-loong64': 0.25.6
'@esbuild/linux-mips64el': 0.25.6
'@esbuild/linux-ppc64': 0.25.6
'@esbuild/linux-riscv64': 0.25.6
'@esbuild/linux-s390x': 0.25.6
'@esbuild/linux-x64': 0.25.6
'@esbuild/netbsd-arm64': 0.25.6
'@esbuild/netbsd-x64': 0.25.6
'@esbuild/openbsd-arm64': 0.25.6
'@esbuild/openbsd-x64': 0.25.6
'@esbuild/openharmony-arm64': 0.25.6
'@esbuild/sunos-x64': 0.25.6
'@esbuild/win32-arm64': 0.25.6
'@esbuild/win32-ia32': 0.25.6
'@esbuild/win32-x64': 0.25.6
escalade@3.2.0: {}
@ -11789,6 +11805,8 @@ snapshots:
- supports-color
- ts-node
jieba-wasm@2.2.0: {}
js-base64@3.7.7: {}
js-tiktoken@1.0.20:
@ -12603,7 +12621,7 @@ snapshots:
obsidian-dataview@0.5.68:
dependencies:
'@codemirror/language': https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67
'@codemirror/language': https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76
'@codemirror/state': 6.5.2
'@codemirror/view': 6.38.0
emoji-regex: 10.4.0


@ -163,7 +163,7 @@ export class RAGEngine {
)
}
async processQuery({
async processSimilarityQuery({
query,
scope,
limit,
@ -211,6 +211,221 @@ export class RAGEngine {
return queryResult
}
async processQuery({
query,
scope,
limit,
language,
onQueryProgressChange,
}: {
query: string
scope?: {
files: string[]
folders: string[]
}
limit?: number
language?: string
onQueryProgressChange?: (queryProgress: QueryProgressState) => void
}): Promise<
(Omit<SelectVector, 'embedding'> & {
similarity: number
})[]
> {
if (!this.embeddingModel) {
throw new Error('Embedding model is not set')
}
await this.initializeDimension()
onQueryProgressChange?.({
type: 'querying',
})
// Run similarity search and full-text search in parallel
const [similarityResults, fulltextResults] = await Promise.all([
this.processSimilarityQuery({
query,
scope,
limit,
onQueryProgressChange: undefined, // avoid triggering the progress callback twice
}),
this.processFulltextQuery({
query,
scope,
limit,
language,
onQueryProgressChange: undefined, // avoid triggering the progress callback twice
}),
])
// Optimization: if one result set is empty, return the other directly
let finalResults: (Omit<SelectVector, 'embedding'> & { similarity: number })[]
if (fulltextResults.length === 0) {
// Full-text results are empty; return the similarity results directly
finalResults = similarityResults
} else if (similarityResults.length === 0) {
// Similarity results are empty; return the full-text results (converted to the common shape)
finalResults = fulltextResults.map(result => ({
...result,
similarity: 1 - (result.rank - 1) / fulltextResults.length, // convert the rank into a similarity score
}))
} else {
// Both searches returned results; merge them with RRF
const rrf_k = 60 // RRF constant
const mergedResults = this.mergeWithRRF(similarityResults, fulltextResults, rrf_k)
// Convert to the shape expected by the existing interface
finalResults = mergedResults.map(result => ({
...result,
similarity: result.rrfScore, // use the RRF score as the similarity
}))
}
onQueryProgressChange?.({
type: 'querying-done',
queryResult: finalResults,
})
return finalResults
}
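// Illustrative usage sketch of the hybrid entry point above. The `ragEngine` variable,
// query text, and folder name are placeholders, not part of this codebase.
async function exampleHybridQuery(ragEngine: RAGEngine) {
  const results = await ragEngine.processQuery({
    query: 'vector database indexing',
    scope: { files: [], folders: ['Notes'] },
    limit: 10,
    language: 'english',
  })
  // `similarity` is the normalized RRF score when both searches return hits, otherwise
  // the score of whichever search produced results.
  for (const r of results) {
    console.log(r.path, r.similarity.toFixed(3))
  }
}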
/**
 * Merges similarity-search and full-text-search results using Reciprocal Rank Fusion (RRF).
 * @param similarityResults results from the similarity search
 * @param fulltextResults results from the full-text search
 * @param k RRF constant, defaults to 60
 * @returns merged results sorted by normalized RRF score
 */
private mergeWithRRF(
similarityResults: (Omit<SelectVector, 'embedding'> & { similarity: number })[],
fulltextResults: (Omit<SelectVector, 'embedding'> & { rank: number })[],
k: number = 60
): (Omit<SelectVector, 'embedding'> & { rrfScore: number })[] {
// Map storing each document's accumulated RRF score
const rrfScores = new Map<string, {
doc: Omit<SelectVector, 'embedding'>,
score: number
}>()
// Process the similarity search results
similarityResults.forEach((result, index) => {
const key = `${result.path}-${result.id}`
const rank = index + 1
const rrfScore = 1 / (k + rank)
if (rrfScores.has(key)) {
const existing = rrfScores.get(key)
if (existing) {
existing.score += rrfScore
}
} else {
rrfScores.set(key, {
doc: {
id: result.id,
path: result.path,
mtime: result.mtime,
content: result.content,
metadata: result.metadata,
},
score: rrfScore
})
}
})
// Process the full-text search results
fulltextResults.forEach((result, index) => {
const key = `${result.path}-${result.id}`
const rank = index + 1
const rrfScore = 1 / (k + rank)
if (rrfScores.has(key)) {
const existing = rrfScores.get(key)
if (existing) {
existing.score += rrfScore
}
} else {
rrfScores.set(key, {
doc: {
id: result.id,
path: result.path,
mtime: result.mtime,
content: result.content,
metadata: result.metadata,
},
score: rrfScore
})
}
})
// Convert to an array and normalize the scores
const results = Array.from(rrfScores.values())
// Find the maximum score for normalization
const maxScore = Math.max(...results.map(r => r.score))
// Normalize to the 0-1 range and sort by score
const mergedResults = results
.map(({ doc, score }) => ({
...doc,
rrfScore: maxScore > 0 ? score / maxScore : 0 // normalize to 0-1
}))
.sort((a, b) => b.rrfScore - a.rrfScore)
return mergedResults
}
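// Worked example of the RRF scoring above with k = 60 and hypothetical ranks:
//   doc A: rank 1 in similarity search only          -> 1/(60+1)            ≈ 0.0164
//   doc B: rank 2 in similarity, rank 1 in full-text -> 1/(60+2) + 1/(60+1) ≈ 0.0325
//   doc C: rank 2 in full-text search only           -> 1/(60+2)            ≈ 0.0161
// After dividing by the maximum score, B ≈ 1.00 while A and C land around 0.50, so a
// document ranked well by both searches floats to the top of the merged list.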
async processFulltextQuery({
query,
scope,
limit,
language,
onQueryProgressChange,
}: {
query: string
scope?: {
files: string[]
folders: string[]
}
limit?: number
language?: string
onQueryProgressChange?: (queryProgress: QueryProgressState) => void
}): Promise<
(Omit<SelectVector, 'embedding'> & {
rank: number
})[]
> {
if (!this.embeddingModel) {
throw new Error('Embedding model is not set')
}
await this.initializeDimension()
onQueryProgressChange?.({
type: 'querying',
})
const queryResult = await this.vectorManager.performFulltextSearch(
query,
this.embeddingModel,
{
limit: limit ?? this.settings.ragOptions.limit,
scope,
language: language || 'english',
},
)
onQueryProgressChange?.({
type: 'querying-done',
queryResult: queryResult.map(result => ({
...result,
similarity: result.rank, // for compatibility with the QueryProgressState type
})),
})
return queryResult
}
async getEmbedding(query: string): Promise<number[]> {
if (!this.embeddingModel) {
throw new Error('Embedding model is not set')


@ -33,6 +33,71 @@ export class VectorManager {
this.repository = new VectorRepository(app, dbManager.getPgClient() as any)
}
// Helper that merges small chunks; merging only happens within a single file
private mergeSmallChunks(chunks: { pageContent: string; metadata: any }[], minChunkSize: number): typeof chunks {
if (!chunks || chunks.length === 0) {
return []
}
const mergedChunks: typeof chunks = []
let currentChunkBuffer = ""
let currentMetadata: any = null
for (const chunk of chunks) {
const content = chunk.pageContent.trim()
if (content.length === 0) continue
// Append the current chunk to the buffer
const combined = currentChunkBuffer ? `${currentChunkBuffer} ${content}` : content
// Update metadata to track the start and end positions
const combinedMetadata = currentMetadata ? {
...currentMetadata,
endLine: chunk.metadata?.loc?.lines?.to || chunk.metadata?.endLine || currentMetadata.endLine
} : {
...chunk.metadata,
startLine: chunk.metadata?.loc?.lines?.from || chunk.metadata?.startLine,
endLine: chunk.metadata?.loc?.lines?.to || chunk.metadata?.endLine
}
if (combined.length < minChunkSize) {
// Still below the minimum size after combining: keep buffering and continue
currentChunkBuffer = combined
currentMetadata = combinedMetadata
} else {
// Combined chunk reached the minimum size: push it to the result and clear the buffer
mergedChunks.push({
pageContent: combined,
metadata: combinedMetadata
})
currentChunkBuffer = ""
currentMetadata = null
}
}
// Handle any small remainder left in the buffer after the loop
if (currentChunkBuffer) {
if (mergedChunks.length > 0) {
// Strategy 1: merge the leftover into the last merged chunk
const lastChunk = mergedChunks[mergedChunks.length - 1]
lastChunk.pageContent += ` ${currentChunkBuffer}`
lastChunk.metadata.endLine = currentMetadata?.endLine || lastChunk.metadata.endLine
} else {
// Strategy 2: no merged chunk exists yet, so keep the leftover as its own chunk
mergedChunks.push({
pageContent: currentChunkBuffer,
metadata: currentMetadata
})
}
}
console.log("mergedChunks: ", mergedChunks)
return mergedChunks
}
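// Example of the merge behaviour above with minChunkSize = 100 (hypothetical chunk sizes):
//   input : [15 chars] [20 chars] [450 chars] [30 chars]
//   pass  : 15 -> buffered; 15+20=36 -> still buffered; 36+450=487 -> emitted as one chunk;
//           the trailing 30 is left in the buffer when the loop ends, so it is appended
//           to the last emitted chunk.
//   output: one ~518-char chunk whose startLine comes from the first piece and whose
//           endLine comes from the last piece it absorbed.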
private segmentTextForTsvector(text: string): string {
return this.repository.segmentTextForTsvector(text)
}
async performSimilaritySearch(
queryVector: number[],
embeddingModel: EmbeddingModel,
@ -56,6 +121,29 @@ export class VectorManager {
)
}
async performFulltextSearch(
searchQuery: string,
embeddingModel: EmbeddingModel,
options: {
limit: number
scope?: {
files: string[]
folders: string[]
}
language?: string
},
): Promise<
(Omit<SelectVector, 'embedding'> & {
rank: number
})[]
> {
return await this.repository.performFulltextSearch(
searchQuery,
embeddingModel,
options,
)
}
async getWorkspaceStatistics(
embeddingModel: EmbeddingModel,
workspace?: Workspace
@ -197,7 +285,10 @@ export class VectorManager {
"",
],
});
console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)
// Minimum chunk size, to avoid producing chunks that are too small
const minChunkSize = Math.max(100, Math.floor(options.chunkSize * 0.3)); // at least 100 characters, or 30% of chunkSize
console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap, "minChunkSize: ", minChunkSize)
const skippedFiles: string[] = []
const embeddingProgress = { completed: 0, totalChunks: 0 }
@ -205,7 +296,7 @@ export class VectorManager {
// Process files in batches of at most 50 (reduced to avoid exhausting file handles)
const FILE_BATCH_SIZE = 50
// Reduce the batch size to lower memory pressure
const embeddingBatchSize = Math.min(options.batchSize, 10)
const embeddingBatchSize = options.batchSize
// First count the total number of chunks for progress reporting
let totalChunks = 0
@ -216,7 +307,13 @@ export class VectorManager {
let fileContent = await this.app.vault.cachedRead(file)
fileContent = fileContent.replace(/\0/g, '')
const fileDocuments = await textSplitter.createDocuments([fileContent])
totalChunks += fileDocuments.length
// The counting pass must apply the same cleanup and merge logic
const cleanedChunks = fileDocuments.map(chunk => ({
pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(),
metadata: chunk.metadata
})).filter(chunk => chunk.pageContent.length > 0)
const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize)
totalChunks += filteredDocuments.length
} catch (error) {
// Skip files that fail during the counting pass
}
@ -246,21 +343,30 @@ export class VectorManager {
const fileDocuments = await textSplitter.createDocuments([
fileContent,
])
return fileDocuments
// Clean each chunk's content first, then merge based on the cleaned content
const cleanedChunks = fileDocuments.map(chunk => ({
pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(),
metadata: chunk.metadata
})).filter(chunk => chunk.pageContent.length > 0)
const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize)
return filteredDocuments
.map((chunk): InsertVector | null => {
// Keep the original content; do not call removeMarkdown here
const rawContent = chunk.pageContent.replace(/\0/g, '')
if (!rawContent || rawContent.trim().length === 0) {
const cleanContent = chunk.pageContent
if (!cleanContent || cleanContent.trim().length === 0) {
return null
}
// Use Intl.Segmenter to add spaces for better TSVECTOR indexing
const segmentedContent = this.segmentTextForTsvector(cleanContent)
return {
path: file.path,
mtime: file.stat.mtime,
content: rawContent, // store the original content
content: segmentedContent, // store the segmented content
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
startLine: Number(chunk.metadata.loc?.lines?.from || chunk.metadata.startLine),
endLine: Number(chunk.metadata.loc?.lines?.to || chunk.metadata.endLine),
},
}
})
@ -280,7 +386,6 @@ export class VectorManager {
// Step 2: embedding
console.log(`Embedding ${batchChunks.length} chunks for current file batch`)
if (embeddingModel.supportsBatch) {
// Providers that support batch embedding
for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
@ -289,26 +394,25 @@ export class VectorManager {
await backOff(
async () => {
// Strip markdown before embedding
const cleanedBatchData = embeddingBatch.map(chunk => {
const cleanContent = removeMarkdown(chunk.content)
return { chunk, cleanContent }
}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
// Content was already cleaned and merged earlier; use it directly
const validBatchData = embeddingBatch.filter(chunk =>
chunk.content && chunk.content.trim().length > 0
)
if (cleanedBatchData.length === 0) {
if (validBatchData.length === 0) {
return
}
const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
const batchTexts = validBatchData.map(chunk => chunk.content)
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
// Attach the embedding results to the chunk data
for (let k = 0; k < cleanedBatchData.length; k++) {
const { chunk, cleanContent } = cleanedBatchData[k]
for (let k = 0; k < validBatchData.length; k++) {
const chunk = validBatchData[k]
const embeddedChunk: InsertVector = {
path: chunk.path,
mtime: chunk.mtime,
content: cleanContent, // use the already-cleaned content
content: chunk.content, // use the cleaned and merged content
embedding: batchEmbeddings[k],
metadata: chunk.metadata,
}
@ -349,18 +453,18 @@ export class VectorManager {
try {
await backOff(
async () => {
// Strip markdown before embedding
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
// Skip content that is empty after cleanup
if (!cleanContent || cleanContent.trim().length === 0) {
// Content was already cleaned and merged earlier; use it directly
const content = chunk.content.trim()
// Skip empty content
if (!content || content.length === 0) {
return
}
const embedding = await embeddingModel.getEmbedding(cleanContent)
const embedding = await embeddingModel.getEmbedding(content)
const embeddedChunk = {
path: chunk.path,
mtime: chunk.mtime,
content: cleanContent, // use the cleaned content
content: content, // use the cleaned and merged content
embedding,
metadata: chunk.metadata,
}
@ -495,7 +599,10 @@ export class VectorManager {
"",
],
});
console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap)
// Minimum chunk size, to avoid producing chunks that are too small
const minChunkSize = Math.max(100, Math.floor(options.chunkSize * 0.5)); // at least 100 characters, or 50% of chunkSize
console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap, "minChunkSize: ", minChunkSize)
const skippedFiles: string[] = []
const embeddingProgress = { completed: 0, totalChunks: 0 }
@ -503,7 +610,7 @@ export class VectorManager {
// Process files in batches of at most 50 (reduced to avoid exhausting file handles)
const FILE_BATCH_SIZE = 50
// Reduce the batch size to lower memory pressure
const embeddingBatchSize = Math.min(options.batchSize, 10)
const embeddingBatchSize = options.batchSize
// First count the total number of chunks for progress reporting
let totalChunks = 0
@ -514,7 +621,13 @@ export class VectorManager {
let fileContent = await this.app.vault.cachedRead(file)
fileContent = fileContent.replace(/\0/g, '')
const fileDocuments = await textSplitter.createDocuments([fileContent])
totalChunks += fileDocuments.length
// The counting pass must apply the same cleanup and merge logic
const cleanedChunks = fileDocuments.map(chunk => ({
pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(),
metadata: chunk.metadata
})).filter(chunk => chunk.pageContent.length > 0)
const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize)
totalChunks += filteredDocuments.length
} catch (error) {
// Skip files that fail during the counting pass
}
@ -544,21 +657,30 @@ export class VectorManager {
const fileDocuments = await textSplitter.createDocuments([
fileContent,
])
return fileDocuments
// Clean each chunk's content first, then merge based on the cleaned content
const cleanedChunks = fileDocuments.map(chunk => ({
pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(),
metadata: chunk.metadata
})).filter(chunk => chunk.pageContent.length > 0)
const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize)
return filteredDocuments
.map((chunk): InsertVector | null => {
// Keep the original content; do not call removeMarkdown here
const rawContent = chunk.pageContent.replace(/\0/g, '')
if (!rawContent || rawContent.trim().length === 0) {
const cleanContent = chunk.pageContent
if (!cleanContent || cleanContent.trim().length === 0) {
return null
}
// Use Intl.Segmenter to add spaces for better TSVECTOR indexing
const segmentedContent = this.segmentTextForTsvector(cleanContent)
return {
path: file.path,
mtime: file.stat.mtime,
content: rawContent, // store the original content
content: segmentedContent, // store the segmented content
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
startLine: Number(chunk.metadata.loc?.lines?.from || chunk.metadata.startLine),
endLine: Number(chunk.metadata.loc?.lines?.to || chunk.metadata.endLine),
},
}
})
@ -581,32 +703,35 @@ export class VectorManager {
if (embeddingModel.supportsBatch) {
// Providers that support batch embedding
console.log("batchChunks", batchChunks.map((chunk, index) => ({
index,
contentLength: chunk.content.length,
})))
for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) {
const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length))
const embeddedBatch: InsertVector[] = []
await backOff(
async () => {
// Strip markdown before embedding
const cleanedBatchData = embeddingBatch.map(chunk => {
const cleanContent = removeMarkdown(chunk.content)
return { chunk, cleanContent }
}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
// Content was already cleaned and merged earlier; use it directly
const validBatchData = embeddingBatch.filter(chunk =>
chunk.content && chunk.content.trim().length > 0
)
if (cleanedBatchData.length === 0) {
if (validBatchData.length === 0) {
return
}
const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
const batchTexts = validBatchData.map(chunk => chunk.content)
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
// Attach the embedding results to the chunk data
for (let k = 0; k < cleanedBatchData.length; k++) {
const { chunk, cleanContent } = cleanedBatchData[k]
for (let k = 0; k < validBatchData.length; k++) {
const chunk = validBatchData[k]
const embeddedChunk: InsertVector = {
path: chunk.path,
mtime: chunk.mtime,
content: cleanContent, // use the already-cleaned content
content: chunk.content, // use the cleaned and merged content
embedding: batchEmbeddings[k],
metadata: chunk.metadata,
}
@ -647,18 +772,18 @@ export class VectorManager {
try {
await backOff(
async () => {
// Strip markdown before embedding
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
// Skip content that is empty after cleanup
if (!cleanContent || cleanContent.trim().length === 0) {
// Content was already cleaned and merged earlier; use it directly
const content = chunk.content.trim()
// Skip empty content
if (!content || content.length === 0) {
return
}
const embedding = await embeddingModel.getEmbedding(cleanContent)
const embedding = await embeddingModel.getEmbedding(content)
const embeddedChunk = {
path: chunk.path,
mtime: chunk.mtime,
content: cleanContent, // use the cleaned content
content: content, // use the cleaned and merged content
embedding,
metadata: chunk.metadata,
}
@ -756,6 +881,10 @@ export class VectorManager {
"",
],
});
// Minimum chunk size, to avoid producing chunks that are too small
const minChunkSize = Math.max(50, Math.floor(chunkSize * 0.1)); // at least 50 characters, or 10% of chunkSize
let fileContent = await this.app.vault.cachedRead(file)
// Strip null bytes to avoid PostgreSQL UTF-8 encoding errors
fileContent = fileContent.replace(/\0/g, '')
@ -763,21 +892,30 @@ export class VectorManager {
fileContent,
])
const contentChunks: InsertVector[] = fileDocuments
// Clean each chunk's content first, then merge based on the cleaned content
const cleanedChunks = fileDocuments.map(chunk => ({
pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(),
metadata: chunk.metadata
})).filter(chunk => chunk.pageContent.length > 0)
const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize)
const contentChunks: InsertVector[] = filteredDocuments
.map((chunk): InsertVector | null => {
// Keep the original content; do not call removeMarkdown here
const rawContent = String(chunk.pageContent || '').replace(/\0/g, '')
if (!rawContent || rawContent.trim().length === 0) {
const cleanContent = chunk.pageContent
if (!cleanContent || cleanContent.trim().length === 0) {
return null
}
// Use Intl.Segmenter to add spaces for better TSVECTOR indexing
const segmentedContent = this.segmentTextForTsvector(cleanContent)
return {
path: file.path,
mtime: file.stat.mtime,
content: rawContent, // store the original content
content: segmentedContent, // store the segmented content
embedding: [],
metadata: {
startLine: Number(chunk.metadata.loc.lines.from),
endLine: Number(chunk.metadata.loc.lines.to),
startLine: Number(chunk.metadata.loc?.lines?.from || chunk.metadata.startLine),
endLine: Number(chunk.metadata.loc?.lines?.to || chunk.metadata.endLine),
},
}
})
@ -797,26 +935,25 @@ export class VectorManager {
await backOff(
async () => {
// Strip markdown before embedding (only once)
const cleanedBatchData = batchChunks.map(chunk => {
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
return { chunk, cleanContent }
}).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0)
// Content was already cleaned and merged earlier; use it directly
const validBatchData = batchChunks.filter(chunk =>
chunk.content && chunk.content.trim().length > 0
)
if (cleanedBatchData.length === 0) {
if (validBatchData.length === 0) {
return
}
const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent)
const batchTexts = validBatchData.map(chunk => chunk.content)
const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts)
// Attach the embedding results to the chunk data
for (let j = 0; j < cleanedBatchData.length; j++) {
const { chunk, cleanContent } = cleanedBatchData[j]
for (let j = 0; j < validBatchData.length; j++) {
const chunk = validBatchData[j]
const embeddedChunk: InsertVector = {
path: chunk.path,
mtime: chunk.mtime,
content: cleanContent, // use the already-cleaned content
content: chunk.content, // use the cleaned and merged content
embedding: batchEmbeddings[j],
metadata: chunk.metadata,
}
@ -864,18 +1001,18 @@ export class VectorManager {
try {
await backOff(
async () => {
// Strip markdown before embedding
const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '')
// Skip content that is empty after cleanup
if (!cleanContent || cleanContent.trim().length === 0) {
// Content was already cleaned and merged earlier; use it directly
const content = chunk.content.trim()
// Skip empty content
if (!content || content.length === 0) {
return
}
const embedding = await embeddingModel.getEmbedding(cleanContent)
const embedding = await embeddingModel.getEmbedding(content)
const embeddedChunk = {
path: chunk.path,
mtime: chunk.mtime,
content: cleanContent, // use the cleaned content
content: content, // use the cleaned and merged content
embedding,
metadata: chunk.metadata,
}


@ -8,10 +8,44 @@ import { InsertVector, SelectVector, vectorTables } from '../../schema'
export class VectorRepository {
private app: App
private db: PGliteInterface | null
private stopWords: Set<string>
constructor(app: App, pgClient: PGliteInterface | null) {
this.app = app
this.db = pgClient
this.stopWords = new Set([
// Chinese stop words
'的', '在', '是', '了', '我', '你', '他', '她', '它', '请问', '如何', '一个', '什么', '怎么',
'这', '那', '和', '与', '或', '但', '因为', '所以', '如果', '虽然', '可是', '不过',
'也', '都', '还', '就', '又', '很', '最', '更', '非常', '特别', '比较', '相当',
'对', '于', '把', '被', '让', '使', '给', '为', '从', '到', '向', '往', '朝',
'上', '下', '里', '外', '前', '后', '左', '右', '中', '间', '内', '以', '及',
// English stop words
'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will',
'with', 'would', 'could', 'should', 'can', 'may', 'might', 'must', 'shall',
'this', 'that', 'these', 'those', 'i', 'you', 'we', 'they', 'me', 'him', 'her',
'us', 'them', 'my', 'your', 'his', 'our', 'their', 'am', 'have', 'had', 'do',
'does', 'did', 'get', 'got', 'go', 'went', 'come', 'came', 'make', 'made',
'take', 'took', 'see', 'saw', 'know', 'knew', 'think', 'thought', 'say', 'said',
'tell', 'told', 'ask', 'asked', 'give', 'gave', 'find', 'found', 'work', 'worked',
'call', 'called', 'try', 'tried', 'need', 'needed', 'feel', 'felt', 'become',
'became', 'leave', 'left', 'put', 'keep', 'kept', 'let', 'begin', 'began',
'seem', 'seemed', 'help', 'helped', 'show', 'showed', 'hear', 'heard', 'play',
'played', 'run', 'ran', 'move', 'moved', 'live', 'lived', 'believe', 'believed',
'hold', 'held', 'bring', 'brought', 'happen', 'happened', 'write', 'wrote',
'sit', 'sat', 'stand', 'stood', 'lose', 'lost', 'pay', 'paid', 'meet', 'met',
'include', 'included', 'continue', 'continued', 'set', 'learn', 'learned',
'change', 'changed', 'lead', 'led', 'understand', 'understood', 'watch', 'watched',
'follow', 'followed', 'stop', 'stopped', 'create', 'created', 'speak', 'spoke',
'read', 'remember', 'remembered', 'consider', 'considered', 'appear', 'appeared',
'buy', 'bought', 'wait', 'waited', 'serve', 'served', 'die', 'died', 'send',
'sent', 'expect', 'expected', 'build', 'built', 'stay', 'stayed', 'fall', 'fell',
'cut', 'reach', 'reached', 'kill', 'killed', 'remain', 'remained', 'suggest',
'suggested', 'raise', 'raised', 'pass', 'passed', 'sell', 'sold', 'require',
'required', 'report', 'reported', 'decide', 'decided', 'pull', 'pulled'
])
}
private getTableName(embeddingModel: EmbeddingModel): string {
@ -186,9 +220,170 @@ export class VectorRepository {
type SearchResult = Omit<SelectVector, 'embedding'> & { similarity: number }
const result = await this.db.query<SearchResult>(query, params)
console.log("performSimilaritySearch result", result.rows)
return result.rows
}
async performFulltextSearch(
searchQuery: string,
embeddingModel: EmbeddingModel,
options: {
limit: number
scope?: {
files: string[]
folders: string[]
}
language?: string
},
): Promise<
(Omit<SelectVector, 'embedding'> & {
rank: number
})[]
> {
if (!this.db) {
throw new DatabaseNotInitializedException()
}
// handle query processing with segmentation and stop words filtering
const processedQuery = this.createFtsQuery(searchQuery, options.language || 'english')
const tableName = this.getTableName(embeddingModel)
const language = options.language || 'english'
let scopeCondition = ''
const params: unknown[] = [processedQuery, options.limit]
let paramIndex = 3
if (options.scope) {
const conditions: string[] = []
if (options.scope.files.length > 0) {
conditions.push(`path = ANY($${paramIndex})`)
params.push(options.scope.files)
paramIndex++
}
if (options.scope.folders.length > 0) {
const folderConditions = options.scope.folders.map((folder, idx) => {
params.push(`${folder}/%`)
return `path LIKE $${paramIndex + idx}`
})
conditions.push(`(${folderConditions.join(' OR ')})`)
paramIndex += options.scope.folders.length
}
if (conditions.length > 0) {
scopeCondition = `AND (${conditions.join(' OR ')})`
}
}
const query = `
SELECT
id, path, mtime, content, metadata,
ts_rank_cd(
COALESCE(content_tsv, to_tsvector('${language}', coalesce(content, ''))),
to_tsquery('${language}', $1)
) AS rank
FROM "${tableName}"
WHERE (
content_tsv @@ to_tsquery('${language}', $1)
OR (content_tsv IS NULL AND to_tsvector('${language}', coalesce(content, '')) @@ to_tsquery('${language}', $1))
)
${scopeCondition}
ORDER BY rank DESC
LIMIT $2
`
console.log("performFulltextSearch query", query)
type SearchResult = Omit<SelectVector, 'embedding'> & { rank: number }
const result = await this.db.query<SearchResult>(query, params)
console.log("performFulltextSearch result", result.rows)
return result.rows
}
public segmentTextForTsvector(text: string, language: string = 'zh-CN'): string {
try {
// Use Intl.Segmenter to add spaces between words for better TSVECTOR indexing
if (typeof Intl !== 'undefined' && Intl.Segmenter) {
const segmenter = new Intl.Segmenter(language, { granularity: 'word' })
const segments = segmenter.segment(text)
const segmentedText = Array.from(segments)
.map(segment => segment.segment)
.join(' ')
return segmentedText
}
// Fallback: add spaces around Chinese characters and punctuation
return text.replace(/([一-龯])/g, ' $1 ')
.replace(/\s+/g, ' ')
.trim()
} catch (error) {
console.warn('Failed to segment text for TSVECTOR:', error)
return text
}
}
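// Quick standalone illustration of the word segmentation used above (a sketch; Intl.Segmenter
// ships with recent Node and browser runtimes, and exact boundaries depend on their ICU data).
function exampleSegment(text: string): string {
  const segmenter = new Intl.Segmenter('zh-CN', { granularity: 'word' })
  return Array.from(segmenter.segment(text)).map((s) => s.segment).join(' ')
}
// exampleSegment('向量数据库支持全文搜索') -> roughly "向量 数据库 支持 全文 搜索"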
private createFtsQuery(query: string, language: string): string {
try {
let keywords: string[] = []
// Try to use Intl.Segmenter for word segmentation
if (typeof Intl !== 'undefined' && Intl.Segmenter) {
try {
const segmenter = new Intl.Segmenter(language, { granularity: 'word' })
const segments = segmenter.segment(query)
keywords = Array.from(segments)
.filter(s => s.isWordLike)
.map(s => s.segment.trim())
.filter(word => {
// Filter out empty strings and stop words
if (!word || word.length === 0) return false
return !this.stopWords.has(word.toLowerCase())
})
.filter(word => {
// Keep all words with length > 0 since stop words are already filtered
return word.length > 0
})
} catch (segmentError) {
console.warn('Intl.Segmenter failed, falling back to simple splitting:', segmentError)
}
}
// Fallback to simple word splitting if Intl.Segmenter is not available or failed
if (keywords.length === 0) {
keywords = query
.split(/[\s\p{P}\p{S}]+/u) // Split by whitespace, punctuation, and symbols
.map(word => word.trim())
.filter(word => {
if (!word || word.length === 0) return false
return !this.stopWords.has(word.toLowerCase())
})
.filter(word => {
// Keep all words with length > 0 since stop words are already filtered
return word.length > 0
})
}
// If no keywords remain, return original query
if (keywords.length === 0) {
return query
}
// Join keywords with | (OR operator) for PostgreSQL full-text search
const ftsQueryString = keywords.join(' | ')
console.log(`Original query: "${query}" -> Processed query: "${ftsQueryString}"`)
return ftsQueryString
} catch (error) {
// If all processing fails, return original query
console.warn('Failed to process FTS query:', error)
return query
}
}
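// Example of the transformation performed by createFtsQuery (hypothetical queries; the
// exact tokens depend on Intl.Segmenter and the stop-word list above):
//   'configure the vector database' -> 'configure | vector | database'   ('the' is a stop word)
//   '如何配置向量数据库'             -> roughly '配置 | 向量 | 数据库'      ('如何' is a stop word)
// The ' | ' join asks PostgreSQL for rows matching any keyword; ts_rank_cd then orders them.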
async getWorkspaceStatistics(
embeddingModel: EmbeddingModel,
scope?: {


@ -261,5 +261,108 @@ export const migrations: Record<string, SqlMigration> = {
ALTER TABLE "source_insight_512" ADD COLUMN IF NOT EXISTS "source_mtime" bigint NOT NULL DEFAULT 0;
ALTER TABLE "source_insight_384" ADD COLUMN IF NOT EXISTS "source_mtime" bigint NOT NULL DEFAULT 0;
`
},
full_text_search: {
description: "Adds full-text search capabilities to embedding and source insight tables",
sql: `
-- Add content_tsv columns to embedding tables
ALTER TABLE "embeddings_1536" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR;
ALTER TABLE "embeddings_1024" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR;
ALTER TABLE "embeddings_768" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR;
ALTER TABLE "embeddings_512" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR;
ALTER TABLE "embeddings_384" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR;
-- Add insight_tsv columns to source insight tables
ALTER TABLE "source_insight_1536" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR;
ALTER TABLE "source_insight_1024" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR;
ALTER TABLE "source_insight_768" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR;
ALTER TABLE "source_insight_512" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR;
ALTER TABLE "source_insight_384" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR;
-- Create trigger function for embeddings tables
CREATE OR REPLACE FUNCTION embeddings_tsv_trigger() RETURNS trigger AS $$
BEGIN
NEW.content_tsv := to_tsvector('english', coalesce(NEW.content, ''));
RETURN NEW;
END
$$ LANGUAGE plpgsql;
-- Create trigger function for source insight tables
CREATE OR REPLACE FUNCTION source_insight_tsv_trigger() RETURNS trigger AS $$
BEGIN
NEW.insight_tsv := to_tsvector('english', coalesce(NEW.insight, ''));
RETURN NEW;
END
$$ LANGUAGE plpgsql;
-- Create triggers for embeddings tables (drop if exists first)
DROP TRIGGER IF EXISTS tsvector_update_embeddings_1536 ON "embeddings_1536";
CREATE TRIGGER tsvector_update_embeddings_1536
BEFORE INSERT OR UPDATE ON "embeddings_1536"
FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_embeddings_1024 ON "embeddings_1024";
CREATE TRIGGER tsvector_update_embeddings_1024
BEFORE INSERT OR UPDATE ON "embeddings_1024"
FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_embeddings_768 ON "embeddings_768";
CREATE TRIGGER tsvector_update_embeddings_768
BEFORE INSERT OR UPDATE ON "embeddings_768"
FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_embeddings_512 ON "embeddings_512";
CREATE TRIGGER tsvector_update_embeddings_512
BEFORE INSERT OR UPDATE ON "embeddings_512"
FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_embeddings_384 ON "embeddings_384";
CREATE TRIGGER tsvector_update_embeddings_384
BEFORE INSERT OR UPDATE ON "embeddings_384"
FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger();
-- Create triggers for source insight tables (drop if exists first)
DROP TRIGGER IF EXISTS tsvector_update_source_insight_1536 ON "source_insight_1536";
CREATE TRIGGER tsvector_update_source_insight_1536
BEFORE INSERT OR UPDATE ON "source_insight_1536"
FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_source_insight_1024 ON "source_insight_1024";
CREATE TRIGGER tsvector_update_source_insight_1024
BEFORE INSERT OR UPDATE ON "source_insight_1024"
FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_source_insight_768 ON "source_insight_768";
CREATE TRIGGER tsvector_update_source_insight_768
BEFORE INSERT OR UPDATE ON "source_insight_768"
FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_source_insight_512 ON "source_insight_512";
CREATE TRIGGER tsvector_update_source_insight_512
BEFORE INSERT OR UPDATE ON "source_insight_512"
FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger();
DROP TRIGGER IF EXISTS tsvector_update_source_insight_384 ON "source_insight_384";
CREATE TRIGGER tsvector_update_source_insight_384
BEFORE INSERT OR UPDATE ON "source_insight_384"
FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger();
-- Note: the tsvector columns stay NULL for rows that existed before this migration,
-- since the triggers only fire on INSERT or UPDATE; backfill with an UPDATE if needed.
-- Create GIN indexes for full-text search on embeddings tables
CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_1536" ON "embeddings_1536" USING GIN(content_tsv);
CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_1024" ON "embeddings_1024" USING GIN(content_tsv);
CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_768" ON "embeddings_768" USING GIN(content_tsv);
CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_512" ON "embeddings_512" USING GIN(content_tsv);
CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_384" ON "embeddings_384" USING GIN(content_tsv);
-- Create GIN indexes for full-text search on source insight tables
CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_1536" ON "source_insight_1536" USING GIN(insight_tsv);
CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_1024" ON "source_insight_1024" USING GIN(insight_tsv);
CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_768" ON "source_insight_768" USING GIN(insight_tsv);
CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_512" ON "source_insight_512" USING GIN(insight_tsv);
CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_384" ON "source_insight_384" USING GIN(insight_tsv);
`
}
};
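// Backfill sketch for rows that existed before the full_text_search migration: the triggers
// above only fire on INSERT/UPDATE, so older rows keep a NULL content_tsv until rewritten.
// `db` is assumed to be the same PGlite client the migration worker passes to db.exec below.
export async function backfillContentTsv(db: { exec: (sql: string) => Promise<unknown> }) {
  const tables = ['embeddings_1536', 'embeddings_1024', 'embeddings_768', 'embeddings_512', 'embeddings_384']
  for (const table of tables) {
    await db.exec(
      `UPDATE "${table}" SET content_tsv = to_tsvector('english', coalesce(content, '')) WHERE content_tsv IS NULL;`
    )
  }
}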


@ -78,6 +78,7 @@ worker({
// Execute SQL migrations
for (const [_key, migration] of Object.entries(migrations)) {
// Split SQL into individual commands and execute them one by one
console.log("migration: ", migration.description)
const commands = migration.sql.split('\n\n').filter(cmd => cmd.trim());
for (const command of commands) {
await db.exec(command);