diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 33d1241..0034512 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -123,6 +123,9 @@ importers: handlebars: specifier: ^4.7.7 version: 4.7.8 + jieba-wasm: + specifier: ^2.2.0 + version: 2.2.0 js-tiktoken: specifier: ^1.0.15 version: 1.0.20 @@ -583,9 +586,9 @@ packages: '@codemirror/language@6.11.2': resolution: {integrity: sha512-p44TsNArL4IVXDTbapUmEkAlvWs2CFQbcfc0ymDsis1kH2wh0gcY96AS29c/vp2d0y2Tquk1EDSaawpzilUiAw==} - '@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67': - resolution: {tarball: https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67} - version: 6.10.8 + '@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76': + resolution: {tarball: https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76} + version: 6.11.2 '@codemirror/lint@0.20.3': resolution: {integrity: sha512-06xUScbbspZ8mKoODQCEx6hz1bjaq9m8W8DxdycWARMiiX1wMtfCh/MoHpaL7ws/KUMwlsFFfp2qhm32oaCvVA==} @@ -669,8 +672,8 @@ packages: cpu: [ppc64] os: [aix] - '@esbuild/aix-ppc64@0.25.5': - resolution: {integrity: sha512-9o3TMmpmftaCMepOdA5k/yDw8SfInyzWWTjYTFCX3kPSDJMROQTb8jg+h9Cnwnmm1vOzvxN7gIfB5V2ewpjtGA==} + '@esbuild/aix-ppc64@0.25.6': + resolution: {integrity: sha512-ShbM/3XxwuxjFiuVBHA+d3j5dyac0aEVVq1oluIDf71hUw0aRF59dV/efUsIwFnR6m8JNM2FjZOzmaZ8yG61kw==} engines: {node: '>=18'} cpu: [ppc64] os: [aix] @@ -693,8 +696,8 @@ packages: cpu: [arm64] os: [android] - '@esbuild/android-arm64@0.25.5': - resolution: {integrity: sha512-VGzGhj4lJO+TVGV1v8ntCZWJktV7SGCs3Pn1GRWI1SBFtRALoomm8k5E9Pmwg3HOAal2VDc2F9+PM/rEY6oIDg==} + '@esbuild/android-arm64@0.25.6': + resolution: {integrity: sha512-hd5zdUarsK6strW+3Wxi5qWws+rJhCCbMiC9QZyzoxfk5uHRIE8T287giQxzVpEvCwuJ9Qjg6bEjcRJcgfLqoA==} engines: {node: '>=18'} cpu: [arm64] os: [android] @@ -717,8 +720,8 @@ 
packages: cpu: [arm] os: [android] - '@esbuild/android-arm@0.25.5': - resolution: {integrity: sha512-AdJKSPeEHgi7/ZhuIPtcQKr5RQdo6OO2IL87JkianiMYMPbCtot9fxPbrMiBADOWWm3T2si9stAiVsGbTQFkbA==} + '@esbuild/android-arm@0.25.6': + resolution: {integrity: sha512-S8ToEOVfg++AU/bHwdksHNnyLyVM+eMVAOf6yRKFitnwnbwwPNqKr3srzFRe7nzV69RQKb5DgchIX5pt3L53xg==} engines: {node: '>=18'} cpu: [arm] os: [android] @@ -741,8 +744,8 @@ packages: cpu: [x64] os: [android] - '@esbuild/android-x64@0.25.5': - resolution: {integrity: sha512-D2GyJT1kjvO//drbRT3Hib9XPwQeWd9vZoBJn+bu/lVsOZ13cqNdDeqIF/xQ5/VmWvMduP6AmXvylO/PIc2isw==} + '@esbuild/android-x64@0.25.6': + resolution: {integrity: sha512-0Z7KpHSr3VBIO9A/1wcT3NTy7EB4oNC4upJ5ye3R7taCc2GUdeynSLArnon5G8scPwaU866d3H4BCrE5xLW25A==} engines: {node: '>=18'} cpu: [x64] os: [android] @@ -765,8 +768,8 @@ packages: cpu: [arm64] os: [darwin] - '@esbuild/darwin-arm64@0.25.5': - resolution: {integrity: sha512-GtaBgammVvdF7aPIgH2jxMDdivezgFu6iKpmT+48+F8Hhg5J/sfnDieg0aeG/jfSvkYQU2/pceFPDKlqZzwnfQ==} + '@esbuild/darwin-arm64@0.25.6': + resolution: {integrity: sha512-FFCssz3XBavjxcFxKsGy2DYK5VSvJqa6y5HXljKzhRZ87LvEi13brPrf/wdyl/BbpbMKJNOr1Sd0jtW4Ge1pAA==} engines: {node: '>=18'} cpu: [arm64] os: [darwin] @@ -789,8 +792,8 @@ packages: cpu: [x64] os: [darwin] - '@esbuild/darwin-x64@0.25.5': - resolution: {integrity: sha512-1iT4FVL0dJ76/q1wd7XDsXrSW+oLoquptvh4CLR4kITDtqi2e/xwXwdCVH8hVHU43wgJdsq7Gxuzcs6Iq/7bxQ==} + '@esbuild/darwin-x64@0.25.6': + resolution: {integrity: sha512-GfXs5kry/TkGM2vKqK2oyiLFygJRqKVhawu3+DOCk7OxLy/6jYkWXhlHwOoTb0WqGnWGAS7sooxbZowy+pK9Yg==} engines: {node: '>=18'} cpu: [x64] os: [darwin] @@ -813,8 +816,8 @@ packages: cpu: [arm64] os: [freebsd] - '@esbuild/freebsd-arm64@0.25.5': - resolution: {integrity: sha512-nk4tGP3JThz4La38Uy/gzyXtpkPW8zSAmoUhK9xKKXdBCzKODMc2adkB2+8om9BDYugz+uGV7sLmpTYzvmz6Sw==} + '@esbuild/freebsd-arm64@0.25.6': + resolution: {integrity: 
sha512-aoLF2c3OvDn2XDTRvn8hN6DRzVVpDlj2B/F66clWd/FHLiHaG3aVZjxQX2DYphA5y/evbdGvC6Us13tvyt4pWg==} engines: {node: '>=18'} cpu: [arm64] os: [freebsd] @@ -837,8 +840,8 @@ packages: cpu: [x64] os: [freebsd] - '@esbuild/freebsd-x64@0.25.5': - resolution: {integrity: sha512-PrikaNjiXdR2laW6OIjlbeuCPrPaAl0IwPIaRv+SMV8CiM8i2LqVUHFC1+8eORgWyY7yhQY+2U2fA55mBzReaw==} + '@esbuild/freebsd-x64@0.25.6': + resolution: {integrity: sha512-2SkqTjTSo2dYi/jzFbU9Plt1vk0+nNg8YC8rOXXea+iA3hfNJWebKYPs3xnOUf9+ZWhKAaxnQNUf2X9LOpeiMQ==} engines: {node: '>=18'} cpu: [x64] os: [freebsd] @@ -861,8 +864,8 @@ packages: cpu: [arm64] os: [linux] - '@esbuild/linux-arm64@0.25.5': - resolution: {integrity: sha512-Z9kfb1v6ZlGbWj8EJk9T6czVEjjq2ntSYLY2cw6pAZl4oKtfgQuS4HOq41M/BcoLPzrUbNd+R4BXFyH//nHxVg==} + '@esbuild/linux-arm64@0.25.6': + resolution: {integrity: sha512-b967hU0gqKd9Drsh/UuAm21Khpoh6mPBSgz8mKRq4P5mVK8bpA+hQzmm/ZwGVULSNBzKdZPQBRT3+WuVavcWsQ==} engines: {node: '>=18'} cpu: [arm64] os: [linux] @@ -885,8 +888,8 @@ packages: cpu: [arm] os: [linux] - '@esbuild/linux-arm@0.25.5': - resolution: {integrity: sha512-cPzojwW2okgh7ZlRpcBEtsX7WBuqbLrNXqLU89GxWbNt6uIg78ET82qifUy3W6OVww6ZWobWub5oqZOVtwolfw==} + '@esbuild/linux-arm@0.25.6': + resolution: {integrity: sha512-SZHQlzvqv4Du5PrKE2faN0qlbsaW/3QQfUUc6yO2EjFcA83xnwm91UbEEVx4ApZ9Z5oG8Bxz4qPE+HFwtVcfyw==} engines: {node: '>=18'} cpu: [arm] os: [linux] @@ -909,8 +912,8 @@ packages: cpu: [ia32] os: [linux] - '@esbuild/linux-ia32@0.25.5': - resolution: {integrity: sha512-sQ7l00M8bSv36GLV95BVAdhJ2QsIbCuCjh/uYrWiMQSUuV+LpXwIqhgJDcvMTj+VsQmqAHL2yYaasENvJ7CDKA==} + '@esbuild/linux-ia32@0.25.6': + resolution: {integrity: sha512-aHWdQ2AAltRkLPOsKdi3xv0mZ8fUGPdlKEjIEhxCPm5yKEThcUjHpWB1idN74lfXGnZ5SULQSgtr5Qos5B0bPw==} engines: {node: '>=18'} cpu: [ia32] os: [linux] @@ -933,8 +936,8 @@ packages: cpu: [loong64] os: [linux] - '@esbuild/linux-loong64@0.25.5': - resolution: {integrity: 
sha512-0ur7ae16hDUC4OL5iEnDb0tZHDxYmuQyhKhsPBV8f99f6Z9KQM02g33f93rNH5A30agMS46u2HP6qTdEt6Q1kg==} + '@esbuild/linux-loong64@0.25.6': + resolution: {integrity: sha512-VgKCsHdXRSQ7E1+QXGdRPlQ/e08bN6WMQb27/TMfV+vPjjTImuT9PmLXupRlC90S1JeNNW5lzkAEO/McKeJ2yg==} engines: {node: '>=18'} cpu: [loong64] os: [linux] @@ -957,8 +960,8 @@ packages: cpu: [mips64el] os: [linux] - '@esbuild/linux-mips64el@0.25.5': - resolution: {integrity: sha512-kB/66P1OsHO5zLz0i6X0RxlQ+3cu0mkxS3TKFvkb5lin6uwZ/ttOkP3Z8lfR9mJOBk14ZwZ9182SIIWFGNmqmg==} + '@esbuild/linux-mips64el@0.25.6': + resolution: {integrity: sha512-WViNlpivRKT9/py3kCmkHnn44GkGXVdXfdc4drNmRl15zVQ2+D2uFwdlGh6IuK5AAnGTo2qPB1Djppj+t78rzw==} engines: {node: '>=18'} cpu: [mips64el] os: [linux] @@ -981,8 +984,8 @@ packages: cpu: [ppc64] os: [linux] - '@esbuild/linux-ppc64@0.25.5': - resolution: {integrity: sha512-UZCmJ7r9X2fe2D6jBmkLBMQetXPXIsZjQJCjgwpVDz+YMcS6oFR27alkgGv3Oqkv07bxdvw7fyB71/olceJhkQ==} + '@esbuild/linux-ppc64@0.25.6': + resolution: {integrity: sha512-wyYKZ9NTdmAMb5730I38lBqVu6cKl4ZfYXIs31Baf8aoOtB4xSGi3THmDYt4BTFHk7/EcVixkOV2uZfwU3Q2Jw==} engines: {node: '>=18'} cpu: [ppc64] os: [linux] @@ -1005,8 +1008,8 @@ packages: cpu: [riscv64] os: [linux] - '@esbuild/linux-riscv64@0.25.5': - resolution: {integrity: sha512-kTxwu4mLyeOlsVIFPfQo+fQJAV9mh24xL+y+Bm6ej067sYANjyEw1dNHmvoqxJUCMnkBdKpvOn0Ahql6+4VyeA==} + '@esbuild/linux-riscv64@0.25.6': + resolution: {integrity: sha512-KZh7bAGGcrinEj4qzilJ4hqTY3Dg2U82c8bv+e1xqNqZCrCyc+TL9AUEn5WGKDzm3CfC5RODE/qc96OcbIe33w==} engines: {node: '>=18'} cpu: [riscv64] os: [linux] @@ -1029,8 +1032,8 @@ packages: cpu: [s390x] os: [linux] - '@esbuild/linux-s390x@0.25.5': - resolution: {integrity: sha512-K2dSKTKfmdh78uJ3NcWFiqyRrimfdinS5ErLSn3vluHNeHVnBAFWC8a4X5N+7FgVE1EjXS1QDZbpqZBjfrqMTQ==} + '@esbuild/linux-s390x@0.25.6': + resolution: {integrity: sha512-9N1LsTwAuE9oj6lHMyyAM+ucxGiVnEqUdp4v7IaMmrwb06ZTEVCIs3oPPplVsnjPfyjmxwHxHMF8b6vzUVAUGw==} engines: {node: '>=18'} cpu: [s390x] os: [linux] @@ 
-1053,14 +1056,14 @@ packages: cpu: [x64] os: [linux] - '@esbuild/linux-x64@0.25.5': - resolution: {integrity: sha512-uhj8N2obKTE6pSZ+aMUbqq+1nXxNjZIIjCjGLfsWvVpy7gKCOL6rsY1MhRh9zLtUtAI7vpgLMK6DxjO8Qm9lJw==} + '@esbuild/linux-x64@0.25.6': + resolution: {integrity: sha512-A6bJB41b4lKFWRKNrWoP2LHsjVzNiaurf7wyj/XtFNTsnPuxwEBWHLty+ZE0dWBKuSK1fvKgrKaNjBS7qbFKig==} engines: {node: '>=18'} cpu: [x64] os: [linux] - '@esbuild/netbsd-arm64@0.25.5': - resolution: {integrity: sha512-pwHtMP9viAy1oHPvgxtOv+OkduK5ugofNTVDilIzBLpoWAM16r7b/mxBvfpuQDpRQFMfuVr5aLcn4yveGvBZvw==} + '@esbuild/netbsd-arm64@0.25.6': + resolution: {integrity: sha512-IjA+DcwoVpjEvyxZddDqBY+uJ2Snc6duLpjmkXm/v4xuS3H+3FkLZlDm9ZsAbF9rsfP3zeA0/ArNDORZgrxR/Q==} engines: {node: '>=18'} cpu: [arm64] os: [netbsd] @@ -1083,14 +1086,14 @@ packages: cpu: [x64] os: [netbsd] - '@esbuild/netbsd-x64@0.25.5': - resolution: {integrity: sha512-WOb5fKrvVTRMfWFNCroYWWklbnXH0Q5rZppjq0vQIdlsQKuw6mdSihwSo4RV/YdQ5UCKKvBy7/0ZZYLBZKIbwQ==} + '@esbuild/netbsd-x64@0.25.6': + resolution: {integrity: sha512-dUXuZr5WenIDlMHdMkvDc1FAu4xdWixTCRgP7RQLBOkkGgwuuzaGSYcOpW4jFxzpzL1ejb8yF620UxAqnBrR9g==} engines: {node: '>=18'} cpu: [x64] os: [netbsd] - '@esbuild/openbsd-arm64@0.25.5': - resolution: {integrity: sha512-7A208+uQKgTxHd0G0uqZO8UjK2R0DDb4fDmERtARjSHWxqMTye4Erz4zZafx7Di9Cv+lNHYuncAkiGFySoD+Mw==} + '@esbuild/openbsd-arm64@0.25.6': + resolution: {integrity: sha512-l8ZCvXP0tbTJ3iaqdNf3pjaOSd5ex/e6/omLIQCVBLmHTlfXW3zAxQ4fnDmPLOB1x9xrcSi/xtCWFwCZRIaEwg==} engines: {node: '>=18'} cpu: [arm64] os: [openbsd] @@ -1113,12 +1116,18 @@ packages: cpu: [x64] os: [openbsd] - '@esbuild/openbsd-x64@0.25.5': - resolution: {integrity: sha512-G4hE405ErTWraiZ8UiSoesH8DaCsMm0Cay4fsFWOOUcz8b8rC6uCvnagr+gnioEjWn0wC+o1/TAHt+It+MpIMg==} + '@esbuild/openbsd-x64@0.25.6': + resolution: {integrity: sha512-hKrmDa0aOFOr71KQ/19JC7az1P0GWtCN1t2ahYAf4O007DHZt/dW8ym5+CUdJhQ/qkZmI1HAF8KkJbEFtCL7gw==} engines: {node: '>=18'} cpu: [x64] os: [openbsd] + 
'@esbuild/openharmony-arm64@0.25.6': + resolution: {integrity: sha512-+SqBcAWoB1fYKmpWoQP4pGtx+pUUC//RNYhFdbcSA16617cchuryuhOCRpPsjCblKukAckWsV+aQ3UKT/RMPcA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openharmony] + '@esbuild/sunos-x64@0.17.3': resolution: {integrity: sha512-RxmhKLbTCDAY2xOfrww6ieIZkZF+KBqG7S2Ako2SljKXRFi+0863PspK74QQ7JpmWwncChY25JTJSbVBYGQk2Q==} engines: {node: '>=12'} @@ -1137,8 +1146,8 @@ packages: cpu: [x64] os: [sunos] - '@esbuild/sunos-x64@0.25.5': - resolution: {integrity: sha512-l+azKShMy7FxzY0Rj4RCt5VD/q8mG/e+mDivgspo+yL8zW7qEwctQ6YqKX34DTEleFAvCIUviCFX1SDZRSyMQA==} + '@esbuild/sunos-x64@0.25.6': + resolution: {integrity: sha512-dyCGxv1/Br7MiSC42qinGL8KkG4kX0pEsdb0+TKhmJZgCUDBGmyo1/ArCjNGiOLiIAgdbWgmWgib4HoCi5t7kA==} engines: {node: '>=18'} cpu: [x64] os: [sunos] @@ -1161,8 +1170,8 @@ packages: cpu: [arm64] os: [win32] - '@esbuild/win32-arm64@0.25.5': - resolution: {integrity: sha512-O2S7SNZzdcFG7eFKgvwUEZ2VG9D/sn/eIiz8XRZ1Q/DO5a3s76Xv0mdBzVM5j5R639lXQmPmSo0iRpHqUUrsxw==} + '@esbuild/win32-arm64@0.25.6': + resolution: {integrity: sha512-42QOgcZeZOvXfsCBJF5Afw73t4veOId//XD3i+/9gSkhSV6Gk3VPlWncctI+JcOyERv85FUo7RxuxGy+z8A43Q==} engines: {node: '>=18'} cpu: [arm64] os: [win32] @@ -1185,8 +1194,8 @@ packages: cpu: [ia32] os: [win32] - '@esbuild/win32-ia32@0.25.5': - resolution: {integrity: sha512-onOJ02pqs9h1iMJ1PQphR+VZv8qBMQ77Klcsqv9CNW2w6yLqoURLcgERAIurY6QE63bbLuqgP9ATqajFLK5AMQ==} + '@esbuild/win32-ia32@0.25.6': + resolution: {integrity: sha512-4AWhgXmDuYN7rJI6ORB+uU9DHLq/erBbuMoAuB4VWJTu5KtCgcKYPynF0YI1VkBNuEfjNlLrFr9KZPJzrtLkrQ==} engines: {node: '>=18'} cpu: [ia32] os: [win32] @@ -1209,8 +1218,8 @@ packages: cpu: [x64] os: [win32] - '@esbuild/win32-x64@0.25.5': - resolution: {integrity: sha512-TXv6YnJ8ZMVdX+SXWVBo/0p8LTcrUYngpWjvm91TMjjBQii7Oz11Lw5lbDV5Y0TzuhSJHwiH4hEtC1I42mMS0g==} + '@esbuild/win32-x64@0.25.6': + resolution: {integrity: 
sha512-NgJPHHbEpLQgDH2MjQu90pzW/5vvXIZ7KOnPyNBm92A6WgZ/7b6fJyUBjoumLqeOQQGqY2QjQxRo97ah4Sj0cA==} engines: {node: '>=18'} cpu: [x64] os: [win32] @@ -3739,8 +3748,8 @@ packages: engines: {node: '>=12'} hasBin: true - esbuild@0.25.5: - resolution: {integrity: sha512-P8OtKZRv/5J5hhz0cUAdu/cLuPIKXpQl1R9pZtvmHWQvrAUVd0UNIPT4IB4W3rNOqVO0rlqHmCIbSwxh/c9yUQ==} + esbuild@0.25.6: + resolution: {integrity: sha512-GVuzuUwtdsghE3ocJ9Bs8PNoF13HNQ5TXbEi2AhvVb8xU1Iwt9Fos9FEamfoee+u/TOsn7GUWc04lz46n2bbTg==} engines: {node: '>=18'} hasBin: true @@ -4723,6 +4732,9 @@ packages: node-notifier: optional: true + jieba-wasm@2.2.0: + resolution: {integrity: sha512-IwxgUf+EMutjLair3k41i0ApM33qeHNY9EFBKlI5/XtHcISkGt5YPmUvpDJe3hUflwRYhy9g29ZzTetGZw6XgQ==} + js-base64@3.7.7: resolution: {integrity: sha512-7rCnleh0z2CkXhH67J8K1Ytz0b2Y+yxTPL+/KOJoa20hfnVQ/3/T6W/KflYI4bRHRagNeXeU2bkNGI3v1oS/lw==} @@ -7089,7 +7101,7 @@ snapshots: '@lezer/lr': 1.4.2 style-mod: 4.1.2 - '@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67': + '@codemirror/language@https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76': dependencies: '@codemirror/state': 6.5.2 '@codemirror/view': 6.38.0 @@ -7190,7 +7202,7 @@ snapshots: '@esbuild/aix-ppc64@0.19.12': optional: true - '@esbuild/aix-ppc64@0.25.5': + '@esbuild/aix-ppc64@0.25.6': optional: true '@esbuild/android-arm64@0.17.3': @@ -7202,7 +7214,7 @@ snapshots: '@esbuild/android-arm64@0.19.12': optional: true - '@esbuild/android-arm64@0.25.5': + '@esbuild/android-arm64@0.25.6': optional: true '@esbuild/android-arm@0.17.3': @@ -7214,7 +7226,7 @@ snapshots: '@esbuild/android-arm@0.19.12': optional: true - '@esbuild/android-arm@0.25.5': + '@esbuild/android-arm@0.25.6': optional: true '@esbuild/android-x64@0.17.3': @@ -7226,7 +7238,7 @@ snapshots: '@esbuild/android-x64@0.19.12': optional: true - '@esbuild/android-x64@0.25.5': + '@esbuild/android-x64@0.25.6': optional: true 
'@esbuild/darwin-arm64@0.17.3': @@ -7238,7 +7250,7 @@ snapshots: '@esbuild/darwin-arm64@0.19.12': optional: true - '@esbuild/darwin-arm64@0.25.5': + '@esbuild/darwin-arm64@0.25.6': optional: true '@esbuild/darwin-x64@0.17.3': @@ -7250,7 +7262,7 @@ snapshots: '@esbuild/darwin-x64@0.19.12': optional: true - '@esbuild/darwin-x64@0.25.5': + '@esbuild/darwin-x64@0.25.6': optional: true '@esbuild/freebsd-arm64@0.17.3': @@ -7262,7 +7274,7 @@ snapshots: '@esbuild/freebsd-arm64@0.19.12': optional: true - '@esbuild/freebsd-arm64@0.25.5': + '@esbuild/freebsd-arm64@0.25.6': optional: true '@esbuild/freebsd-x64@0.17.3': @@ -7274,7 +7286,7 @@ snapshots: '@esbuild/freebsd-x64@0.19.12': optional: true - '@esbuild/freebsd-x64@0.25.5': + '@esbuild/freebsd-x64@0.25.6': optional: true '@esbuild/linux-arm64@0.17.3': @@ -7286,7 +7298,7 @@ snapshots: '@esbuild/linux-arm64@0.19.12': optional: true - '@esbuild/linux-arm64@0.25.5': + '@esbuild/linux-arm64@0.25.6': optional: true '@esbuild/linux-arm@0.17.3': @@ -7298,7 +7310,7 @@ snapshots: '@esbuild/linux-arm@0.19.12': optional: true - '@esbuild/linux-arm@0.25.5': + '@esbuild/linux-arm@0.25.6': optional: true '@esbuild/linux-ia32@0.17.3': @@ -7310,7 +7322,7 @@ snapshots: '@esbuild/linux-ia32@0.19.12': optional: true - '@esbuild/linux-ia32@0.25.5': + '@esbuild/linux-ia32@0.25.6': optional: true '@esbuild/linux-loong64@0.17.3': @@ -7322,7 +7334,7 @@ snapshots: '@esbuild/linux-loong64@0.19.12': optional: true - '@esbuild/linux-loong64@0.25.5': + '@esbuild/linux-loong64@0.25.6': optional: true '@esbuild/linux-mips64el@0.17.3': @@ -7334,7 +7346,7 @@ snapshots: '@esbuild/linux-mips64el@0.19.12': optional: true - '@esbuild/linux-mips64el@0.25.5': + '@esbuild/linux-mips64el@0.25.6': optional: true '@esbuild/linux-ppc64@0.17.3': @@ -7346,7 +7358,7 @@ snapshots: '@esbuild/linux-ppc64@0.19.12': optional: true - '@esbuild/linux-ppc64@0.25.5': + '@esbuild/linux-ppc64@0.25.6': optional: true '@esbuild/linux-riscv64@0.17.3': @@ -7358,7 +7370,7 @@ 
snapshots: '@esbuild/linux-riscv64@0.19.12': optional: true - '@esbuild/linux-riscv64@0.25.5': + '@esbuild/linux-riscv64@0.25.6': optional: true '@esbuild/linux-s390x@0.17.3': @@ -7370,7 +7382,7 @@ snapshots: '@esbuild/linux-s390x@0.19.12': optional: true - '@esbuild/linux-s390x@0.25.5': + '@esbuild/linux-s390x@0.25.6': optional: true '@esbuild/linux-x64@0.17.3': @@ -7382,10 +7394,10 @@ snapshots: '@esbuild/linux-x64@0.19.12': optional: true - '@esbuild/linux-x64@0.25.5': + '@esbuild/linux-x64@0.25.6': optional: true - '@esbuild/netbsd-arm64@0.25.5': + '@esbuild/netbsd-arm64@0.25.6': optional: true '@esbuild/netbsd-x64@0.17.3': @@ -7397,10 +7409,10 @@ snapshots: '@esbuild/netbsd-x64@0.19.12': optional: true - '@esbuild/netbsd-x64@0.25.5': + '@esbuild/netbsd-x64@0.25.6': optional: true - '@esbuild/openbsd-arm64@0.25.5': + '@esbuild/openbsd-arm64@0.25.6': optional: true '@esbuild/openbsd-x64@0.17.3': @@ -7412,7 +7424,10 @@ snapshots: '@esbuild/openbsd-x64@0.19.12': optional: true - '@esbuild/openbsd-x64@0.25.5': + '@esbuild/openbsd-x64@0.25.6': + optional: true + + '@esbuild/openharmony-arm64@0.25.6': optional: true '@esbuild/sunos-x64@0.17.3': @@ -7424,7 +7439,7 @@ snapshots: '@esbuild/sunos-x64@0.19.12': optional: true - '@esbuild/sunos-x64@0.25.5': + '@esbuild/sunos-x64@0.25.6': optional: true '@esbuild/win32-arm64@0.17.3': @@ -7436,7 +7451,7 @@ snapshots: '@esbuild/win32-arm64@0.19.12': optional: true - '@esbuild/win32-arm64@0.25.5': + '@esbuild/win32-arm64@0.25.6': optional: true '@esbuild/win32-ia32@0.17.3': @@ -7448,7 +7463,7 @@ snapshots: '@esbuild/win32-ia32@0.19.12': optional: true - '@esbuild/win32-ia32@0.25.5': + '@esbuild/win32-ia32@0.25.6': optional: true '@esbuild/win32-x64@0.17.3': @@ -7460,7 +7475,7 @@ snapshots: '@esbuild/win32-x64@0.19.12': optional: true - '@esbuild/win32-x64@0.25.5': + '@esbuild/win32-x64@0.25.6': optional: true '@eslint-community/eslint-utils@4.7.0(eslint@8.57.1)': @@ -10339,7 +10354,7 @@ snapshots: 
esbuild-plugin-inline-worker@0.1.1: dependencies: - esbuild: 0.25.5 + esbuild: 0.25.6 find-cache-dir: 3.3.2 esbuild-register@3.6.0(esbuild@0.19.12): @@ -10425,33 +10440,34 @@ snapshots: '@esbuild/win32-ia32': 0.19.12 '@esbuild/win32-x64': 0.19.12 - esbuild@0.25.5: + esbuild@0.25.6: optionalDependencies: - '@esbuild/aix-ppc64': 0.25.5 - '@esbuild/android-arm': 0.25.5 - '@esbuild/android-arm64': 0.25.5 - '@esbuild/android-x64': 0.25.5 - '@esbuild/darwin-arm64': 0.25.5 - '@esbuild/darwin-x64': 0.25.5 - '@esbuild/freebsd-arm64': 0.25.5 - '@esbuild/freebsd-x64': 0.25.5 - '@esbuild/linux-arm': 0.25.5 - '@esbuild/linux-arm64': 0.25.5 - '@esbuild/linux-ia32': 0.25.5 - '@esbuild/linux-loong64': 0.25.5 - '@esbuild/linux-mips64el': 0.25.5 - '@esbuild/linux-ppc64': 0.25.5 - '@esbuild/linux-riscv64': 0.25.5 - '@esbuild/linux-s390x': 0.25.5 - '@esbuild/linux-x64': 0.25.5 - '@esbuild/netbsd-arm64': 0.25.5 - '@esbuild/netbsd-x64': 0.25.5 - '@esbuild/openbsd-arm64': 0.25.5 - '@esbuild/openbsd-x64': 0.25.5 - '@esbuild/sunos-x64': 0.25.5 - '@esbuild/win32-arm64': 0.25.5 - '@esbuild/win32-ia32': 0.25.5 - '@esbuild/win32-x64': 0.25.5 + '@esbuild/aix-ppc64': 0.25.6 + '@esbuild/android-arm': 0.25.6 + '@esbuild/android-arm64': 0.25.6 + '@esbuild/android-x64': 0.25.6 + '@esbuild/darwin-arm64': 0.25.6 + '@esbuild/darwin-x64': 0.25.6 + '@esbuild/freebsd-arm64': 0.25.6 + '@esbuild/freebsd-x64': 0.25.6 + '@esbuild/linux-arm': 0.25.6 + '@esbuild/linux-arm64': 0.25.6 + '@esbuild/linux-ia32': 0.25.6 + '@esbuild/linux-loong64': 0.25.6 + '@esbuild/linux-mips64el': 0.25.6 + '@esbuild/linux-ppc64': 0.25.6 + '@esbuild/linux-riscv64': 0.25.6 + '@esbuild/linux-s390x': 0.25.6 + '@esbuild/linux-x64': 0.25.6 + '@esbuild/netbsd-arm64': 0.25.6 + '@esbuild/netbsd-x64': 0.25.6 + '@esbuild/openbsd-arm64': 0.25.6 + '@esbuild/openbsd-x64': 0.25.6 + '@esbuild/openharmony-arm64': 0.25.6 + '@esbuild/sunos-x64': 0.25.6 + '@esbuild/win32-arm64': 0.25.6 + '@esbuild/win32-ia32': 0.25.6 + '@esbuild/win32-x64': 0.25.6 
escalade@3.2.0: {} @@ -11789,6 +11805,8 @@ snapshots: - supports-color - ts-node + jieba-wasm@2.2.0: {} + js-base64@3.7.7: {} js-tiktoken@1.0.20: @@ -12603,7 +12621,7 @@ snapshots: obsidian-dataview@0.5.68: dependencies: - '@codemirror/language': https://codeload.github.com/lishid/cm-language/tar.gz/6c1c5f5b677f6f6503d1ca2ec47f62f6406cda67 + '@codemirror/language': https://codeload.github.com/lishid/cm-language/tar.gz/a9c3c7efe17dd1d24395ee2a179fe12dd6ed1e76 '@codemirror/state': 6.5.2 '@codemirror/view': 6.38.0 emoji-regex: 10.4.0 diff --git a/src/core/rag/rag-engine.ts b/src/core/rag/rag-engine.ts index f4ace14..a873c0c 100644 --- a/src/core/rag/rag-engine.ts +++ b/src/core/rag/rag-engine.ts @@ -163,7 +163,7 @@ export class RAGEngine { ) } - async processQuery({ + async processSimilarityQuery({ query, scope, limit, @@ -211,6 +211,221 @@ export class RAGEngine { return queryResult } + async processQuery({ + query, + scope, + limit, + language, + onQueryProgressChange, + }: { + query: string + scope?: { + files: string[] + folders: string[] + } + limit?: number + language?: string + onQueryProgressChange?: (queryProgress: QueryProgressState) => void + }): Promise< + (Omit & { + similarity: number + })[] + > { + if (!this.embeddingModel) { + throw new Error('Embedding model is not set') + } + + await this.initializeDimension() + + onQueryProgressChange?.({ + type: 'querying', + }) + + // 并行执行相似度搜索和全文搜索 + const [similarityResults, fulltextResults] = await Promise.all([ + this.processSimilarityQuery({ + query, + scope, + limit, + onQueryProgressChange: undefined, // 避免重复触发进度回调 + }), + this.processFulltextQuery({ + query, + scope, + limit, + language, + onQueryProgressChange: undefined, // 避免重复触发进度回调 + }), + ]) + + // 优化:如果其中一个搜索结果为空,直接返回另一个结果 + let finalResults: (Omit & { similarity: number })[] + + if (fulltextResults.length === 0) { + // 全文搜索结果为空,直接返回相似度搜索结果 + finalResults = similarityResults + } else if (similarityResults.length === 0) { + // 
相似度搜索结果为空,直接返回全文搜索结果(转换格式) + finalResults = fulltextResults.map(result => ({ + ...result, + similarity: 1 - (result.rank - 1) / fulltextResults.length, // 将rank转换为相似度分数 + })) + } else { + // 两个搜索都有结果,使用 RRF 算法合并 + const rrf_k = 60 // RRF 常数 + const mergedResults = this.mergeWithRRF(similarityResults, fulltextResults, rrf_k) + + // 转换为与现有接口兼容的格式 + finalResults = mergedResults.map(result => ({ + ...result, + similarity: result.rrfScore, // 使用 RRF 分数作为相似度 + })) + } + + onQueryProgressChange?.({ + type: 'querying-done', + queryResult: finalResults, + }) + + return finalResults + } + + /** + * 使用倒数排名融合(RRF)算法合并相似度搜索和全文搜索结果 + * @param similarityResults 相似度搜索结果 + * @param fulltextResults 全文搜索结果 + * @param k RRF 常数,通常为 60 + * @returns 合并后的结果,按 RRF 分数排序 + */ + private mergeWithRRF( + similarityResults: (Omit & { similarity: number })[], + fulltextResults: (Omit & { rank: number })[], + k: number = 60 + ): (Omit & { rrfScore: number })[] { + // 创建一个 Map 来存储每个文档的 RRF 分数 + const rrfScores = new Map, + score: number + }>() + + // 处理相似度搜索结果 + similarityResults.forEach((result, index) => { + const key = `${result.path}-${result.id}` + const rank = index + 1 + const rrfScore = 1 / (k + rank) + + if (rrfScores.has(key)) { + const existing = rrfScores.get(key) + if (existing) { + existing.score += rrfScore + } + } else { + rrfScores.set(key, { + doc: { + id: result.id, + path: result.path, + mtime: result.mtime, + content: result.content, + metadata: result.metadata, + }, + score: rrfScore + }) + } + }) + + // 处理全文搜索结果 + fulltextResults.forEach((result, index) => { + const key = `${result.path}-${result.id}` + const rank = index + 1 + const rrfScore = 1 / (k + rank) + + if (rrfScores.has(key)) { + const existing = rrfScores.get(key) + if (existing) { + existing.score += rrfScore + } + } else { + rrfScores.set(key, { + doc: { + id: result.id, + path: result.path, + mtime: result.mtime, + content: result.content, + metadata: result.metadata, + }, + score: rrfScore + }) + } + }) + + // 
转换为数组并进行归一化处理 + const results = Array.from(rrfScores.values()) + + // 找到最大分数用于归一化 + const maxScore = Math.max(...results.map(r => r.score)) + + // 归一化到 0~1 范围并按分数排序 + const mergedResults = results + .map(({ doc, score }) => ({ + ...doc, + rrfScore: maxScore > 0 ? score / maxScore : 0 // 归一化到 0~1 + })) + .sort((a, b) => b.rrfScore - a.rrfScore) + + return mergedResults + } + + async processFulltextQuery({ + query, + scope, + limit, + language, + onQueryProgressChange, + }: { + query: string + scope?: { + files: string[] + folders: string[] + } + limit?: number + language?: string + onQueryProgressChange?: (queryProgress: QueryProgressState) => void + }): Promise< + (Omit & { + rank: number + })[] + > { + if (!this.embeddingModel) { + throw new Error('Embedding model is not set') + } + + await this.initializeDimension() + + onQueryProgressChange?.({ + type: 'querying', + }) + + const queryResult = await this.vectorManager.performFulltextSearch( + query, + this.embeddingModel, + { + limit: limit ?? 
this.settings.ragOptions.limit, + scope, + language: language || 'english', + }, + ) + + onQueryProgressChange?.({ + type: 'querying-done', + queryResult: queryResult.map(result => ({ + ...result, + similarity: result.rank, // 为了兼容 QueryProgressState 类型 + })), + }) + + return queryResult + } + async getEmbedding(query: string): Promise { if (!this.embeddingModel) { throw new Error('Embedding model is not set') diff --git a/src/database/modules/vector/vector-manager.ts b/src/database/modules/vector/vector-manager.ts index 1100a88..d7440a2 100644 --- a/src/database/modules/vector/vector-manager.ts +++ b/src/database/modules/vector/vector-manager.ts @@ -33,6 +33,71 @@ export class VectorManager { this.repository = new VectorRepository(app, dbManager.getPgClient() as any) } + // 添加合并小chunks的辅助方法(仅在同一文件内合并) + private mergeSmallChunks(chunks: { pageContent: string; metadata: any }[], minChunkSize: number): typeof chunks { + if (!chunks || chunks.length === 0) { + return [] + } + + const mergedChunks: typeof chunks = [] + let currentChunkBuffer = "" + let currentMetadata: any = null + + for (const chunk of chunks) { + const content = chunk.pageContent.trim() + if (content.length === 0) continue + + // 将当前块加入缓冲区 + const combined = currentChunkBuffer ? `${currentChunkBuffer} ${content}` : content + + // 更新metadata,记录起始和结束位置 + const combinedMetadata = currentMetadata ? 
{ + ...currentMetadata, + endLine: chunk.metadata?.loc?.lines?.to || chunk.metadata?.endLine || currentMetadata.endLine + } : { + ...chunk.metadata, + startLine: chunk.metadata?.loc?.lines?.from || chunk.metadata?.startLine, + endLine: chunk.metadata?.loc?.lines?.to || chunk.metadata?.endLine + } + + if (combined.length < minChunkSize) { + // 如果组合后仍然太小,则更新缓冲区并继续循环 + currentChunkBuffer = combined + currentMetadata = combinedMetadata + } else { + // 如果组合后达到或超过最小尺寸,将其推入最终数组,并清空缓冲区 + mergedChunks.push({ + pageContent: combined, + metadata: combinedMetadata + }) + currentChunkBuffer = "" + currentMetadata = null + } + } + + // 处理循环结束后缓冲区里可能剩下的最后一个小块 + if (currentChunkBuffer) { + if (mergedChunks.length > 0) { + // 策略1:如果缓冲区有内容,将其合并到最后一个块中 + const lastChunk = mergedChunks[mergedChunks.length - 1] + lastChunk.pageContent += ` ${currentChunkBuffer}` + lastChunk.metadata.endLine = currentMetadata?.endLine || lastChunk.metadata.endLine + } else { + // 策略2:或者如果就没有足够大的块,把它自己作为一个块 + mergedChunks.push({ + pageContent: currentChunkBuffer, + metadata: currentMetadata + }) + } + } + console.log("mergedChunks: ", mergedChunks) + return mergedChunks + } + + private segmentTextForTsvector(text: string): string { + return this.repository.segmentTextForTsvector(text) + } + async performSimilaritySearch( queryVector: number[], embeddingModel: EmbeddingModel, @@ -56,6 +121,29 @@ export class VectorManager { ) } + async performFulltextSearch( + searchQuery: string, + embeddingModel: EmbeddingModel, + options: { + limit: number + scope?: { + files: string[] + folders: string[] + } + language?: string + }, + ): Promise< + (Omit & { + rank: number + })[] + > { + return await this.repository.performFulltextSearch( + searchQuery, + embeddingModel, + options, + ) + } + async getWorkspaceStatistics( embeddingModel: EmbeddingModel, workspace?: Workspace @@ -197,7 +285,10 @@ export class VectorManager { "", ], }); - console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap) + 
+ // 设置最小chunk大小,防止产生太小的chunks + const minChunkSize = Math.max(100, Math.floor(options.chunkSize * 0.3)); // 最小50字符或chunk_size的50% + console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap, "minChunkSize: ", minChunkSize) const skippedFiles: string[] = [] const embeddingProgress = { completed: 0, totalChunks: 0 } @@ -205,7 +296,7 @@ export class VectorManager { // 分批处理文件,每批最多50个文件(减少以避免文件句柄耗尽) const FILE_BATCH_SIZE = 50 // 减少批量大小以降低内存压力 - const embeddingBatchSize = Math.min(options.batchSize, 10) + const embeddingBatchSize = options.batchSize // 首先统计总的分块数量用于进度显示 let totalChunks = 0 @@ -216,7 +307,13 @@ export class VectorManager { let fileContent = await this.app.vault.cachedRead(file) fileContent = fileContent.replace(/\0/g, '') const fileDocuments = await textSplitter.createDocuments([fileContent]) - totalChunks += fileDocuments.length + // 统计阶段也需要使用相同的清理和合并逻辑 + const cleanedChunks = fileDocuments.map(chunk => ({ + pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(), + metadata: chunk.metadata + })).filter(chunk => chunk.pageContent.length > 0) + const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize) + totalChunks += filteredDocuments.length } catch (error) { // 统计阶段跳过错误文件 } @@ -246,21 +343,30 @@ export class VectorManager { const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) - return fileDocuments + + // 先清理每个chunk的内容,然后基于清理后的内容进行合并 + const cleanedChunks = fileDocuments.map(chunk => ({ + pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(), + metadata: chunk.metadata + })).filter(chunk => chunk.pageContent.length > 0) + + const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize) + return filteredDocuments .map((chunk): InsertVector | null => { - // 保存原始内容,不在此处调用 removeMarkdown - const rawContent = chunk.pageContent.replace(/\0/g, '') - if (!rawContent || rawContent.trim().length === 0) { + const cleanContent = chunk.pageContent + if 
(!cleanContent || cleanContent.trim().length === 0) { return null } + // Use Intl.Segmenter to add spaces for better TSVECTOR indexing + const segmentedContent = this.segmentTextForTsvector(cleanContent) return { path: file.path, mtime: file.stat.mtime, - content: rawContent, // 保存原始内容 + content: segmentedContent, // 使用分词后的内容 embedding: [], metadata: { - startLine: Number(chunk.metadata.loc.lines.from), - endLine: Number(chunk.metadata.loc.lines.to), + startLine: Number(chunk.metadata.loc?.lines?.from || chunk.metadata.startLine), + endLine: Number(chunk.metadata.loc?.lines?.to || chunk.metadata.endLine), }, } }) @@ -280,7 +386,6 @@ export class VectorManager { // 第二步:嵌入处理 console.log(`Embedding ${batchChunks.length} chunks for current file batch`) - if (embeddingModel.supportsBatch) { // 支持批量处理的提供商 for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) { @@ -289,26 +394,25 @@ export class VectorManager { await backOff( async () => { - // 在嵌入之前处理 markdown - const cleanedBatchData = embeddingBatch.map(chunk => { - const cleanContent = removeMarkdown(chunk.content) - return { chunk, cleanContent } - }).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0) + // 内容已经在前面清理和合并过了,直接使用 + const validBatchData = embeddingBatch.filter(chunk => + chunk.content && chunk.content.trim().length > 0 + ) - if (cleanedBatchData.length === 0) { + if (validBatchData.length === 0) { return } - const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent) + const batchTexts = validBatchData.map(chunk => chunk.content) const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) // 合并embedding结果到chunk数据 - for (let k = 0; k < cleanedBatchData.length; k++) { - const { chunk, cleanContent } = cleanedBatchData[k] + for (let k = 0; k < validBatchData.length; k++) { + const chunk = validBatchData[k] const embeddedChunk: InsertVector = { path: chunk.path, mtime: chunk.mtime, - content: cleanContent, // 使用已经清理过的内容 + content: 
chunk.content, // 使用已经清理和合并后的内容 embedding: batchEmbeddings[k], metadata: chunk.metadata, } @@ -349,18 +453,18 @@ export class VectorManager { try { await backOff( async () => { - // 在嵌入之前处理 markdown - const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') - // 跳过清理后为空的内容 - if (!cleanContent || cleanContent.trim().length === 0) { + // 内容已经在前面清理和合并过了,直接使用 + const content = chunk.content.trim() + // 跳过空内容 + if (!content || content.length === 0) { return } - const embedding = await embeddingModel.getEmbedding(cleanContent) + const embedding = await embeddingModel.getEmbedding(content) const embeddedChunk = { path: chunk.path, mtime: chunk.mtime, - content: cleanContent, // 使用清理后的内容 + content: content, // 使用已经清理和合并后的内容 embedding, metadata: chunk.metadata, } @@ -495,7 +599,10 @@ export class VectorManager { "", ], }); - console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap) + + // 设置最小chunk大小,防止产生太小的chunks + const minChunkSize = Math.max(100, Math.floor(options.chunkSize * 0.5)); // 最小100字符或chunk_size的50% + console.log("textSplitter chunkSize: ", options.chunkSize, "overlap: ", overlap, "minChunkSize: ", minChunkSize) const skippedFiles: string[] = [] const embeddingProgress = { completed: 0, totalChunks: 0 } @@ -503,7 +610,7 @@ export class VectorManager { // 分批处理文件,每批最多50个文件(减少以避免文件句柄耗尽) const FILE_BATCH_SIZE = 50 // 减少批量大小以降低内存压力 - const embeddingBatchSize = Math.min(options.batchSize, 10) + const embeddingBatchSize = options.batchSize // 首先统计总的分块数量用于进度显示 let totalChunks = 0 @@ -514,7 +621,13 @@ export class VectorManager { let fileContent = await this.app.vault.cachedRead(file) fileContent = fileContent.replace(/\0/g, '') const fileDocuments = await textSplitter.createDocuments([fileContent]) - totalChunks += fileDocuments.length + // 统计阶段也需要使用相同的清理和合并逻辑 + const cleanedChunks = fileDocuments.map(chunk => ({ + pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(), + metadata: chunk.metadata + })).filter(chunk 
=> chunk.pageContent.length > 0) + const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize) + totalChunks += filteredDocuments.length } catch (error) { // 统计阶段跳过错误文件 } @@ -544,21 +657,30 @@ export class VectorManager { const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) - return fileDocuments + + // 先清理每个chunk的内容,然后基于清理后的内容进行合并 + const cleanedChunks = fileDocuments.map(chunk => ({ + pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(), + metadata: chunk.metadata + })).filter(chunk => chunk.pageContent.length > 0) + + const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize) + return filteredDocuments .map((chunk): InsertVector | null => { - // 保存原始内容,不在此处调用 removeMarkdown - const rawContent = chunk.pageContent.replace(/\0/g, '') - if (!rawContent || rawContent.trim().length === 0) { + const cleanContent = chunk.pageContent + if (!cleanContent || cleanContent.trim().length === 0) { return null } + // Use Intl.Segmenter to add spaces for better TSVECTOR indexing + const segmentedContent = this.segmentTextForTsvector(cleanContent) return { path: file.path, mtime: file.stat.mtime, - content: rawContent, // 保存原始内容 + content: segmentedContent, // 使用分词后的内容 embedding: [], metadata: { - startLine: Number(chunk.metadata.loc.lines.from), - endLine: Number(chunk.metadata.loc.lines.to), + startLine: Number(chunk.metadata.loc?.lines?.from || chunk.metadata.startLine), + endLine: Number(chunk.metadata.loc?.lines?.to || chunk.metadata.endLine), }, } }) @@ -581,32 +703,35 @@ export class VectorManager { if (embeddingModel.supportsBatch) { // 支持批量处理的提供商 + console.log("batchChunks", batchChunks.map((chunk, index) => ({ + index, + contentLength: chunk.content.length, + }))) for (let j = 0; j < batchChunks.length; j += embeddingBatchSize) { const embeddingBatch = batchChunks.slice(j, Math.min(j + embeddingBatchSize, batchChunks.length)) const embeddedBatch: InsertVector[] = [] await backOff( async () => { 
- // 在嵌入之前处理 markdown - const cleanedBatchData = embeddingBatch.map(chunk => { - const cleanContent = removeMarkdown(chunk.content) - return { chunk, cleanContent } - }).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0) + // 内容已经在前面清理和合并过了,直接使用 + const validBatchData = embeddingBatch.filter(chunk => + chunk.content && chunk.content.trim().length > 0 + ) - if (cleanedBatchData.length === 0) { + if (validBatchData.length === 0) { return } - const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent) + const batchTexts = validBatchData.map(chunk => chunk.content) const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) // 合并embedding结果到chunk数据 - for (let k = 0; k < cleanedBatchData.length; k++) { - const { chunk, cleanContent } = cleanedBatchData[k] + for (let k = 0; k < validBatchData.length; k++) { + const chunk = validBatchData[k] const embeddedChunk: InsertVector = { path: chunk.path, mtime: chunk.mtime, - content: cleanContent, // 使用已经清理过的内容 + content: chunk.content, // 使用已经清理和合并后的内容 embedding: batchEmbeddings[k], metadata: chunk.metadata, } @@ -647,18 +772,18 @@ export class VectorManager { try { await backOff( async () => { - // 在嵌入之前处理 markdown - const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') - // 跳过清理后为空的内容 - if (!cleanContent || cleanContent.trim().length === 0) { + // 内容已经在前面清理和合并过了,直接使用 + const content = chunk.content.trim() + // 跳过空内容 + if (!content || content.length === 0) { return } - const embedding = await embeddingModel.getEmbedding(cleanContent) + const embedding = await embeddingModel.getEmbedding(content) const embeddedChunk = { path: chunk.path, mtime: chunk.mtime, - content: cleanContent, // 使用清理后的内容 + content: content, // 使用已经清理和合并后的内容 embedding, metadata: chunk.metadata, } @@ -756,28 +881,41 @@ export class VectorManager { "", ], }); + + // 设置最小chunk大小,防止产生太小的chunks + const minChunkSize = Math.max(50, Math.floor(chunkSize * 0.1)); // 最小50字符或chunk_size的10% + 
let fileContent = await this.app.vault.cachedRead(file) // 清理null字节,防止PostgreSQL UTF8编码错误 fileContent = fileContent.replace(/\0/g, '') const fileDocuments = await textSplitter.createDocuments([ fileContent, ]) + + // 先清理每个chunk的内容,然后基于清理后的内容进行合并 + const cleanedChunks = fileDocuments.map(chunk => ({ + pageContent: removeMarkdown(chunk.pageContent).replace(/\0/g, '').trim(), + metadata: chunk.metadata + })).filter(chunk => chunk.pageContent.length > 0) + + const filteredDocuments = this.mergeSmallChunks(cleanedChunks, minChunkSize) - const contentChunks: InsertVector[] = fileDocuments + const contentChunks: InsertVector[] = filteredDocuments .map((chunk): InsertVector | null => { - // 保存原始内容,不在此处调用 removeMarkdown - const rawContent = String(chunk.pageContent || '').replace(/\0/g, '') - if (!rawContent || rawContent.trim().length === 0) { + const cleanContent = chunk.pageContent + if (!cleanContent || cleanContent.trim().length === 0) { return null } + // Use Intl.Segmenter to add spaces for better TSVECTOR indexing + const segmentedContent = this.segmentTextForTsvector(cleanContent) return { path: file.path, mtime: file.stat.mtime, - content: rawContent, // 保存原始内容 + content: segmentedContent, // 使用分词后的内容 embedding: [], metadata: { - startLine: Number(chunk.metadata.loc.lines.from), - endLine: Number(chunk.metadata.loc.lines.to), + startLine: Number(chunk.metadata.loc?.lines?.from || chunk.metadata.startLine), + endLine: Number(chunk.metadata.loc?.lines?.to || chunk.metadata.endLine), }, } }) @@ -795,34 +933,33 @@ export class VectorManager { const embeddedBatch: InsertVector[] = [] - await backOff( - async () => { - // 在嵌入之前处理 markdown,只处理一次 - const cleanedBatchData = batchChunks.map(chunk => { - const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') - return { chunk, cleanContent } - }).filter(({ cleanContent }) => cleanContent && cleanContent.trim().length > 0) + await backOff( + async () => { + // 内容已经在前面清理和合并过了,直接使用 + const validBatchData = 
batchChunks.filter(chunk => + chunk.content && chunk.content.trim().length > 0 + ) - if (cleanedBatchData.length === 0) { - return - } - - const batchTexts = cleanedBatchData.map(({ cleanContent }) => cleanContent) - const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) - - // 合并embedding结果到chunk数据 - for (let j = 0; j < cleanedBatchData.length; j++) { - const { chunk, cleanContent } = cleanedBatchData[j] - const embeddedChunk: InsertVector = { - path: chunk.path, - mtime: chunk.mtime, - content: cleanContent, // 使用已经清理过的内容 - embedding: batchEmbeddings[j], - metadata: chunk.metadata, + if (validBatchData.length === 0) { + return } - embeddedBatch.push(embeddedChunk) - } - }, + + const batchTexts = validBatchData.map(chunk => chunk.content) + const batchEmbeddings = await embeddingModel.getBatchEmbeddings(batchTexts) + + // 合并embedding结果到chunk数据 + for (let j = 0; j < validBatchData.length; j++) { + const chunk = validBatchData[j] + const embeddedChunk: InsertVector = { + path: chunk.path, + mtime: chunk.mtime, + content: chunk.content, // 使用已经清理和合并后的内容 + embedding: batchEmbeddings[j], + metadata: chunk.metadata, + } + embeddedBatch.push(embeddedChunk) + } + }, { numOfAttempts: 3, // 减少重试次数 startingDelay: 500, // 减少延迟 @@ -864,18 +1001,18 @@ export class VectorManager { try { await backOff( async () => { - // 在嵌入之前处理 markdown - const cleanContent = removeMarkdown(chunk.content).replace(/\0/g, '') - // 跳过清理后为空的内容 - if (!cleanContent || cleanContent.trim().length === 0) { + // 内容已经在前面清理和合并过了,直接使用 + const content = chunk.content.trim() + // 跳过空内容 + if (!content || content.length === 0) { return } - const embedding = await embeddingModel.getEmbedding(cleanContent) + const embedding = await embeddingModel.getEmbedding(content) const embeddedChunk = { path: chunk.path, mtime: chunk.mtime, - content: cleanContent, // 使用清理后的内容 + content: content, // 使用已经清理和合并后的内容 embedding, metadata: chunk.metadata, } diff --git 
a/src/database/modules/vector/vector-repository.ts b/src/database/modules/vector/vector-repository.ts index c989ed3..f80f026 100644 --- a/src/database/modules/vector/vector-repository.ts +++ b/src/database/modules/vector/vector-repository.ts @@ -6,174 +6,208 @@ import { DatabaseNotInitializedException } from '../../exception' import { InsertVector, SelectVector, vectorTables } from '../../schema' export class VectorRepository { - private app: App - private db: PGliteInterface | null + private app: App + private db: PGliteInterface | null + private stopWords: Set - constructor(app: App, pgClient: PGliteInterface | null) { - this.app = app - this.db = pgClient - } + constructor(app: App, pgClient: PGliteInterface | null) { + this.app = app + this.db = pgClient + this.stopWords = new Set([ + // Chinese stop words + '的', '在', '是', '了', '我', '你', '他', '她', '它', '请问', '如何', '一个', '什么', '怎么', + '这', '那', '和', '与', '或', '但', '因为', '所以', '如果', '虽然', '可是', '不过', + '也', '都', '还', '就', '又', '很', '最', '更', '非常', '特别', '比较', '相当', + '对', '于', '把', '被', '让', '使', '给', '为', '从', '到', '向', '往', '朝', + '上', '下', '里', '外', '前', '后', '左', '右', '中', '间', '内', '以', '及', - private getTableName(embeddingModel: EmbeddingModel): string { - const tableDefinition = vectorTables[embeddingModel.dimension] - if (!tableDefinition) { - throw new Error(`No table definition found for model: ${embeddingModel.id}`) - } - return tableDefinition.name - } + // English stop words + 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', + 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was', 'were', 'will', + 'with', 'would', 'could', 'should', 'can', 'may', 'might', 'must', 'shall', + 'this', 'that', 'these', 'those', 'i', 'you', 'we', 'they', 'me', 'him', 'her', + 'us', 'them', 'my', 'your', 'his', 'our', 'their', 'am', 'have', 'had', 'do', + 'does', 'did', 'get', 'got', 'go', 'went', 'come', 'came', 'make', 'made', + 'take', 'took', 'see', 'saw', 'know', 'knew', 'think', 
'thought', 'say', 'said', + 'tell', 'told', 'ask', 'asked', 'give', 'gave', 'find', 'found', 'work', 'worked', + 'call', 'called', 'try', 'tried', 'need', 'needed', 'feel', 'felt', 'become', + 'became', 'leave', 'left', 'put', 'keep', 'kept', 'let', 'begin', 'began', + 'seem', 'seemed', 'help', 'helped', 'show', 'showed', 'hear', 'heard', 'play', + 'played', 'run', 'ran', 'move', 'moved', 'live', 'lived', 'believe', 'believed', + 'hold', 'held', 'bring', 'brought', 'happen', 'happened', 'write', 'wrote', + 'sit', 'sat', 'stand', 'stood', 'lose', 'lost', 'pay', 'paid', 'meet', 'met', + 'include', 'included', 'continue', 'continued', 'set', 'learn', 'learned', + 'change', 'changed', 'lead', 'led', 'understand', 'understood', 'watch', 'watched', + 'follow', 'followed', 'stop', 'stopped', 'create', 'created', 'speak', 'spoke', + 'read', 'remember', 'remembered', 'consider', 'considered', 'appear', 'appeared', + 'buy', 'bought', 'wait', 'waited', 'serve', 'served', 'die', 'died', 'send', + 'sent', 'expect', 'expected', 'build', 'built', 'stay', 'stayed', 'fall', 'fell', + 'cut', 'reach', 'reached', 'kill', 'killed', 'remain', 'remained', 'suggest', + 'suggested', 'raise', 'raised', 'pass', 'passed', 'sell', 'sold', 'require', + 'required', 'report', 'reported', 'decide', 'decided', 'pull', 'pulled' + ]) + } - async getAllIndexedFilePaths(embeddingModel: EmbeddingModel): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - const result = await this.db.query<{ path: string }>( - `SELECT DISTINCT path FROM "${tableName}"` - ) - return result.rows.map((row: { path: string }) => row.path) - } - - async getMaxMtime(embeddingModel: EmbeddingModel): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - const result = await this.db.query<{ max_mtime: number | null }>( - `SELECT MAX(mtime) as max_mtime FROM 
"${tableName}"` - ) - return result.rows[0]?.max_mtime || null - } - - async getVectorsByFilePath( - filePath: string, - embeddingModel: EmbeddingModel, - ): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - const result = await this.db.query( - `SELECT * FROM "${tableName}" WHERE path = $1`, - [filePath] - ) - return result.rows - } - - async deleteVectorsForSingleFile( - filePath: string, - embeddingModel: EmbeddingModel, - ): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - await this.db.query( - `DELETE FROM "${tableName}" WHERE path = $1`, - [filePath] - ) - } - - async deleteVectorsForMultipleFiles( - filePaths: string[], - embeddingModel: EmbeddingModel, - ): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - await this.db.query( - `DELETE FROM "${tableName}" WHERE path = ANY($1)`, - [filePaths] - ) - } - - async clearAllVectors(embeddingModel: EmbeddingModel): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - await this.db.query(`DELETE FROM "${tableName}"`) - } - - async insertVectors( - data: InsertVector[], - embeddingModel: EmbeddingModel, - ): Promise { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - - // 构建批量插入的 SQL - const values = data.map((vector, index) => { - const offset = index * 5 - return `($${offset + 1}, $${offset + 2}, $${offset + 3}, $${offset + 4}, $${offset + 5})` - }).join(',') - - const params = data.flatMap(vector => [ - vector.path, - vector.mtime, - vector.content.replace(/\0/g, ''), // 清理null字节 - `[${vector.embedding.join(',')}]`, // 转换为PostgreSQL vector格式 - vector.metadata - ]) - - await this.db.query( - `INSERT INTO 
"${tableName}" (path, mtime, content, embedding, metadata) - VALUES ${values}`, - params - ) - } - - async performSimilaritySearch( - queryVector: number[], - embeddingModel: EmbeddingModel, - options: { - minSimilarity: number - limit: number - scope?: { - files: string[] - folders: string[] - } - }, - ): Promise< - (Omit & { - similarity: number - })[] - > { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) - - let scopeCondition = '' - const params: unknown[] = [`[${queryVector.join(',')}]`, options.minSimilarity, options.limit] - let paramIndex = 4 - - if (options.scope) { - const conditions: string[] = [] - - if (options.scope.files.length > 0) { - conditions.push(`path = ANY($${paramIndex})`) - params.push(options.scope.files) - paramIndex++ - } - - if (options.scope.folders.length > 0) { - const folderConditions = options.scope.folders.map((folder, idx) => { - params.push(`${folder}/%`) - return `path LIKE $${paramIndex + idx}` - }) - conditions.push(`(${folderConditions.join(' OR ')})`) - paramIndex += options.scope.folders.length - } - - if (conditions.length > 0) { - scopeCondition = `AND (${conditions.join(' OR ')})` - } + private getTableName(embeddingModel: EmbeddingModel): string { + const tableDefinition = vectorTables[embeddingModel.dimension] + if (!tableDefinition) { + throw new Error(`No table definition found for model: ${embeddingModel.id}`) } - - const query = ` + return tableDefinition.name + } + + async getAllIndexedFilePaths(embeddingModel: EmbeddingModel): Promise { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + const result = await this.db.query<{ path: string }>( + `SELECT DISTINCT path FROM "${tableName}"` + ) + return result.rows.map((row: { path: string }) => row.path) + } + + async getMaxMtime(embeddingModel: EmbeddingModel): Promise { + if (!this.db) { + throw new 
DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + const result = await this.db.query<{ max_mtime: number | null }>( + `SELECT MAX(mtime) as max_mtime FROM "${tableName}"` + ) + return result.rows[0]?.max_mtime || null + } + + async getVectorsByFilePath( + filePath: string, + embeddingModel: EmbeddingModel, + ): Promise { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + const result = await this.db.query( + `SELECT * FROM "${tableName}" WHERE path = $1`, + [filePath] + ) + return result.rows + } + + async deleteVectorsForSingleFile( + filePath: string, + embeddingModel: EmbeddingModel, + ): Promise { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + await this.db.query( + `DELETE FROM "${tableName}" WHERE path = $1`, + [filePath] + ) + } + + async deleteVectorsForMultipleFiles( + filePaths: string[], + embeddingModel: EmbeddingModel, + ): Promise { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + await this.db.query( + `DELETE FROM "${tableName}" WHERE path = ANY($1)`, + [filePaths] + ) + } + + async clearAllVectors(embeddingModel: EmbeddingModel): Promise { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + await this.db.query(`DELETE FROM "${tableName}"`) + } + + async insertVectors( + data: InsertVector[], + embeddingModel: EmbeddingModel, + ): Promise { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + + // 构建批量插入的 SQL + const values = data.map((vector, index) => { + const offset = index * 5 + return `($${offset + 1}, $${offset + 2}, $${offset + 3}, $${offset + 4}, $${offset + 5})` + }).join(',') + + const params = data.flatMap(vector => [ + vector.path, 
+ vector.mtime, + vector.content.replace(/\0/g, ''), // 清理null字节 + `[${vector.embedding.join(',')}]`, // 转换为PostgreSQL vector格式 + vector.metadata + ]) + + await this.db.query( + `INSERT INTO "${tableName}" (path, mtime, content, embedding, metadata) + VALUES ${values}`, + params + ) + } + + async performSimilaritySearch( + queryVector: number[], + embeddingModel: EmbeddingModel, + options: { + minSimilarity: number + limit: number + scope?: { + files: string[] + folders: string[] + } + }, + ): Promise< + (Omit & { + similarity: number + })[] + > { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + + let scopeCondition = '' + const params: unknown[] = [`[${queryVector.join(',')}]`, options.minSimilarity, options.limit] + let paramIndex = 4 + + if (options.scope) { + const conditions: string[] = [] + + if (options.scope.files.length > 0) { + conditions.push(`path = ANY($${paramIndex})`) + params.push(options.scope.files) + paramIndex++ + } + + if (options.scope.folders.length > 0) { + const folderConditions = options.scope.folders.map((folder, idx) => { + params.push(`${folder}/%`) + return `path LIKE $${paramIndex + idx}` + }) + conditions.push(`(${folderConditions.join(' OR ')})`) + paramIndex += options.scope.folders.length + } + + if (conditions.length > 0) { + scopeCondition = `AND (${conditions.join(' OR ')})` + } + } + + const query = ` SELECT id, path, mtime, content, metadata, 1 - (embedding <=> $1::vector) as similarity @@ -184,54 +218,215 @@ export class VectorRepository { LIMIT $3 ` - type SearchResult = Omit & { similarity: number } - const result = await this.db.query(query, params) - return result.rows + type SearchResult = Omit & { similarity: number } + const result = await this.db.query(query, params) + console.log("performSimilaritySearch result", result.rows) + return result.rows + } + + async performFulltextSearch( + searchQuery: string, + embeddingModel: EmbeddingModel, + 
options: { + limit: number + scope?: { + files: string[] + folders: string[] + } + language?: string + }, + ): Promise< + (Omit & { + rank: number + })[] + > { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + + // handle query processing with segmentation and stop words filtering + const processedQuery = this.createFtsQuery(searchQuery, options.language || 'english') + + const tableName = this.getTableName(embeddingModel) + const language = options.language || 'english' + + let scopeCondition = '' + const params: unknown[] = [processedQuery, options.limit] + let paramIndex = 3 + + if (options.scope) { + const conditions: string[] = [] + + if (options.scope.files.length > 0) { + conditions.push(`path = ANY($${paramIndex})`) + params.push(options.scope.files) + paramIndex++ + } + + if (options.scope.folders.length > 0) { + const folderConditions = options.scope.folders.map((folder, idx) => { + params.push(`${folder}/%`) + return `path LIKE $${paramIndex + idx}` + }) + conditions.push(`(${folderConditions.join(' OR ')})`) + paramIndex += options.scope.folders.length + } + + if (conditions.length > 0) { + scopeCondition = `AND (${conditions.join(' OR ')})` + } + } + + const query = ` + SELECT + id, path, mtime, content, metadata, + ts_rank_cd( + COALESCE(content_tsv, to_tsvector('${language}', coalesce(content, ''))), + to_tsquery('${language}', $1) + ) AS rank + FROM "${tableName}" + WHERE ( + content_tsv @@ to_tsquery('${language}', $1) + OR (content_tsv IS NULL AND to_tsvector('${language}', coalesce(content, '')) @@ to_tsquery('${language}', $1)) + ) + ${scopeCondition} + ORDER BY rank DESC + LIMIT $2 + ` + console.log("performFulltextSearch query", query) + type SearchResult = Omit & { rank: number } + const result = await this.db.query(query, params) + console.log("performFulltextSearch result", result.rows) + return result.rows + } + + public segmentTextForTsvector(text: string, language: string = 'zh-CN'): string { + try { + // Use 
Intl.Segmenter to add spaces between words for better TSVECTOR indexing + if (typeof Intl !== 'undefined' && Intl.Segmenter) { + const segmenter = new Intl.Segmenter(language, { granularity: 'word' }) + const segments = segmenter.segment(text) + + const segmentedText = Array.from(segments) + .map(segment => segment.segment) + .join(' ') + + return segmentedText + } + + // Fallback: add spaces around Chinese characters and punctuation + return text.replace(/([一-龯])/g, ' $1 ') + .replace(/\s+/g, ' ') + .trim() + } catch (error) { + console.warn('Failed to segment text for TSVECTOR:', error) + return text + } } - async getWorkspaceStatistics( - embeddingModel: EmbeddingModel, - scope?: { - files: string[] - folders: string[] - } - ): Promise<{ - totalFiles: number - totalChunks: number - }> { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) + private createFtsQuery(query: string, language: string): string { + try { - let scopeCondition = '' - const params: unknown[] = [] - let paramIndex = 1 + let keywords: string[] = [] - if (scope) { - const conditions: string[] = [] + // Try to use Intl.Segmenter for word segmentation + if (typeof Intl !== 'undefined' && Intl.Segmenter) { + try { + const segmenter = new Intl.Segmenter(language, { granularity: 'word' }) + const segments = segmenter.segment(query) - if (scope.files.length > 0) { - conditions.push(`path = ANY($${paramIndex})`) - params.push(scope.files) - paramIndex++ - } + keywords = Array.from(segments) + .filter(s => s.isWordLike) + .map(s => s.segment.trim()) + .filter(word => { + // Filter out empty strings and stop words + if (!word || word.length === 0) return false + return !this.stopWords.has(word.toLowerCase()) + }) + .filter(word => { + // Keep all words with length > 0 since stop words are already filtered + return word.length > 0 + }) + } catch (segmentError) { + console.warn('Intl.Segmenter failed, falling back to simple 
splitting:', segmentError) + } + } - if (scope.folders.length > 0) { - const folderConditions = scope.folders.map((folder, idx) => { - params.push(`${folder}/%`) - return `path LIKE $${paramIndex + idx}` - }) - conditions.push(`(${folderConditions.join(' OR ')})`) - paramIndex += scope.folders.length - } + // Fallback to simple word splitting if Intl.Segmenter is not available or failed + if (keywords.length === 0) { + keywords = query + .split(/[\s\p{P}\p{S}]+/u) // Split by whitespace, punctuation, and symbols + .map(word => word.trim()) + .filter(word => { + if (!word || word.length === 0) return false + return !this.stopWords.has(word.toLowerCase()) + }) + .filter(word => { + // Keep all words with length > 0 since stop words are already filtered + return word.length > 0 + }) + } - if (conditions.length > 0) { - scopeCondition = `WHERE (${conditions.join(' OR ')})` - } - } + // If no keywords remain, return original query + if (keywords.length === 0) { + return query + } - const query = ` + // Join keywords with | (OR) for PostgreSQL full-text search, favoring recall over precision + const ftsQueryString = keywords.join(' | ') + + console.log(`Original query: "${query}" -> Processed query: "${ftsQueryString}"`) + return ftsQueryString + } catch (error) { + // If all processing fails, return original query + console.warn('Failed to process FTS query:', error) + return query + } + } + + async getWorkspaceStatistics( + embeddingModel: EmbeddingModel, + scope?: { + files: string[] + folders: string[] + } + ): Promise<{ + totalFiles: number + totalChunks: number + }> { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) + + let scopeCondition = '' + const params: unknown[] = [] + let paramIndex = 1 + + if (scope) { + const conditions: string[] = [] + + if (scope.files.length > 0) { + conditions.push(`path = ANY($${paramIndex})`) + params.push(scope.files) + paramIndex++ + } + + if (scope.folders.length > 0) { + const folderConditions 
= scope.folders.map((folder, idx) => { + params.push(`${folder}/%`) + return `path LIKE $${paramIndex + idx}` + }) + conditions.push(`(${folderConditions.join(' OR ')})`) + paramIndex += scope.folders.length + } + + if (conditions.length > 0) { + scopeCondition = `WHERE (${conditions.join(' OR ')})` + } + } + + const query = ` SELECT COUNT(DISTINCT path) as total_files, COUNT(*) as total_chunks @@ -239,43 +434,43 @@ export class VectorRepository { ${scopeCondition} ` - const result = await this.db.query<{ - total_files: number - total_chunks: number - }>(query, params) + const result = await this.db.query<{ + total_files: number + total_chunks: number + }>(query, params) - const row = result.rows[0] - return { - totalFiles: Number(row?.total_files || 0), - totalChunks: Number(row?.total_chunks || 0) - } - } + const row = result.rows[0] + return { + totalFiles: Number(row?.total_files || 0), + totalChunks: Number(row?.total_chunks || 0) + } + } - async getVaultStatistics(embeddingModel: EmbeddingModel): Promise<{ - totalFiles: number - totalChunks: number - }> { - if (!this.db) { - throw new DatabaseNotInitializedException() - } - const tableName = this.getTableName(embeddingModel) + async getVaultStatistics(embeddingModel: EmbeddingModel): Promise<{ + totalFiles: number + totalChunks: number + }> { + if (!this.db) { + throw new DatabaseNotInitializedException() + } + const tableName = this.getTableName(embeddingModel) - const query = ` + const query = ` SELECT COUNT(DISTINCT path) as total_files, COUNT(*) as total_chunks FROM "${tableName}" ` - const result = await this.db.query<{ - total_files: number - total_chunks: number - }>(query) + const result = await this.db.query<{ + total_files: number + total_chunks: number + }>(query) - const row = result.rows[0] - return { - totalFiles: Number(row?.total_files || 0), - totalChunks: Number(row?.total_chunks || 0) - } - } + const row = result.rows[0] + return { + totalFiles: Number(row?.total_files || 0), + totalChunks: 
Number(row?.total_chunks || 0) + } + } } diff --git a/src/database/sql.ts b/src/database/sql.ts index b088463..af234bb 100644 --- a/src/database/sql.ts +++ b/src/database/sql.ts @@ -261,5 +261,108 @@ export const migrations: Record = { ALTER TABLE "source_insight_512" ADD COLUMN IF NOT EXISTS "source_mtime" bigint NOT NULL DEFAULT 0; ALTER TABLE "source_insight_384" ADD COLUMN IF NOT EXISTS "source_mtime" bigint NOT NULL DEFAULT 0; ` + }, + full_text_search: { + description: "Adds full-text search capabilities to embedding and source insight tables", + sql: ` + -- Add content_tsv columns to embedding tables + ALTER TABLE "embeddings_1536" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR; + ALTER TABLE "embeddings_1024" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR; + ALTER TABLE "embeddings_768" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR; + ALTER TABLE "embeddings_512" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR; + ALTER TABLE "embeddings_384" ADD COLUMN IF NOT EXISTS "content_tsv" TSVECTOR; + + -- Add insight_tsv columns to source insight tables + ALTER TABLE "source_insight_1536" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR; + ALTER TABLE "source_insight_1024" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR; + ALTER TABLE "source_insight_768" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR; + ALTER TABLE "source_insight_512" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR; + ALTER TABLE "source_insight_384" ADD COLUMN IF NOT EXISTS "insight_tsv" TSVECTOR; + + -- Create trigger function for embeddings tables + CREATE OR REPLACE FUNCTION embeddings_tsv_trigger() RETURNS trigger AS $$ + BEGIN + NEW.content_tsv := to_tsvector('english', coalesce(NEW.content, '')); + RETURN NEW; + END + $$ LANGUAGE plpgsql; + + -- Create trigger function for source insight tables + CREATE OR REPLACE FUNCTION source_insight_tsv_trigger() RETURNS trigger AS $$ + BEGIN + NEW.insight_tsv := to_tsvector('english', coalesce(NEW.insight, '')); + RETURN NEW; + END + $$ LANGUAGE 
plpgsql; + + -- Create triggers for embeddings tables (drop if exists first) + DROP TRIGGER IF EXISTS tsvector_update_embeddings_1536 ON "embeddings_1536"; + CREATE TRIGGER tsvector_update_embeddings_1536 + BEFORE INSERT OR UPDATE ON "embeddings_1536" + FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_embeddings_1024 ON "embeddings_1024"; + CREATE TRIGGER tsvector_update_embeddings_1024 + BEFORE INSERT OR UPDATE ON "embeddings_1024" + FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_embeddings_768 ON "embeddings_768"; + CREATE TRIGGER tsvector_update_embeddings_768 + BEFORE INSERT OR UPDATE ON "embeddings_768" + FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_embeddings_512 ON "embeddings_512"; + CREATE TRIGGER tsvector_update_embeddings_512 + BEFORE INSERT OR UPDATE ON "embeddings_512" + FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_embeddings_384 ON "embeddings_384"; + CREATE TRIGGER tsvector_update_embeddings_384 + BEFORE INSERT OR UPDATE ON "embeddings_384" + FOR EACH ROW EXECUTE FUNCTION embeddings_tsv_trigger(); + + -- Create triggers for source insight tables (drop if exists first) + DROP TRIGGER IF EXISTS tsvector_update_source_insight_1536 ON "source_insight_1536"; + CREATE TRIGGER tsvector_update_source_insight_1536 + BEFORE INSERT OR UPDATE ON "source_insight_1536" + FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_source_insight_1024 ON "source_insight_1024"; + CREATE TRIGGER tsvector_update_source_insight_1024 + BEFORE INSERT OR UPDATE ON "source_insight_1024" + FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_source_insight_768 ON "source_insight_768"; + CREATE TRIGGER tsvector_update_source_insight_768 + BEFORE INSERT OR UPDATE ON 
"source_insight_768" + FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_source_insight_512 ON "source_insight_512"; + CREATE TRIGGER tsvector_update_source_insight_512 + BEFORE INSERT OR UPDATE ON "source_insight_512" + FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger(); + + DROP TRIGGER IF EXISTS tsvector_update_source_insight_384 ON "source_insight_384"; + CREATE TRIGGER tsvector_update_source_insight_384 + BEFORE INSERT OR UPDATE ON "source_insight_384" + FOR EACH ROW EXECUTE FUNCTION source_insight_tsv_trigger(); + + -- Note: existing rows keep a NULL tsvector; only newly inserted/updated rows are populated via the triggers + -- This avoids the file-handle exhaustion that a mass backfill UPDATE would cause + + -- Create GIN indexes for full-text search on embeddings tables + CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_1536" ON "embeddings_1536" USING GIN(content_tsv); + CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_1024" ON "embeddings_1024" USING GIN(content_tsv); + CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_768" ON "embeddings_768" USING GIN(content_tsv); + CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_512" ON "embeddings_512" USING GIN(content_tsv); + CREATE INDEX IF NOT EXISTS "embeddings_content_tsv_idx_384" ON "embeddings_384" USING GIN(content_tsv); + + -- Create GIN indexes for full-text search on source insight tables + CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_1536" ON "source_insight_1536" USING GIN(insight_tsv); + CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_1024" ON "source_insight_1024" USING GIN(insight_tsv); + CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_768" ON "source_insight_768" USING GIN(insight_tsv); + CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_512" ON "source_insight_512" USING GIN(insight_tsv); + CREATE INDEX IF NOT EXISTS "source_insight_tsv_idx_384" ON "source_insight_384" USING GIN(insight_tsv); + ` } }; diff --git a/src/pgworker/pglite.worker.ts b/src/pgworker/pglite.worker.ts index cc8a587..8479d74
100644 --- a/src/pgworker/pglite.worker.ts +++ b/src/pgworker/pglite.worker.ts @@ -78,6 +78,7 @@ worker({ // Execute SQL migrations for (const [_key, migration] of Object.entries(migrations)) { // Split SQL into individual commands and execute them one by one + console.log("migration: ", migration.description) const commands = migration.sql.split('\n\n').filter(cmd => cmd.trim()); for (const command of commands) { await db.exec(command);