perf: chunk filter

2023-08-07 10:59:31 +08:00 · 2023-08-07 10:59:31 +08:00 · 7fe20ef041
commit 7fe20ef041
parent 1964640d5c
5 changed files with 47 additions and 51 deletions
--- a/client/src/components/Layout/navbar.tsx
+++ b/client/src/components/Layout/navbar.tsx
@ -118,9 +118,9 @@ const Navbar = ({ unread }: { unread: number }) => {
                }
              : {
                  color: 'myGray.500',
-                  backgroundColor: 'transparent'
+                  backgroundColor: 'transparent',
                  onClick: () => router.push(item.link)
                })}
            onClick={() => router.push(item.link)}
          >
            <MyIcon
              name={
--- a/client/src/pages/kb/detail/components/Import/Chunk.tsx
+++ b/client/src/pages/kb/detail/components/Import/Chunk.tsx
@ -258,7 +258,9 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
              <Box>
                段落长度
                <MyTooltip
-                  label={'基于 Gpt3.5 的 Token 计算方法进行分段。前后段落会有 30% 的内容重叠。'}
+                  label={
                    '按结束标点符号进行分段。前后段落会有 30% 的内容重叠。\n中文文档建议不要超过800，英文不要超过1500'
                  }
                  forceShow
                >
                  <QuestionOutlineIcon ml={1} />
@ -269,7 +271,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
                flex={1}
                defaultValue={chunkLen}
                min={300}
-                max={1000}
+                max={2000}
                step={10}
                onChange={(e) => {
                  setChunkLen(+e);
@ -294,10 +296,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
                  <QuestionOutlineIcon ml={1} />
                </MyTooltip>
              </Box>
-              <Box ml={4}>
+              <Box ml={4}>{price}元</Box>
                {}
                {price}元
              </Box>
            </Flex>
            <Flex mt={3}>
              {showRePreview && (
--- a/client/src/pages/kb/detail/components/Import/QA.tsx
+++ b/client/src/pages/kb/detail/components/Import/QA.tsx
@ -1,18 +1,5 @@
 import React, { useState, useCallback, useMemo } from 'react';
-import {
+import { Box, Flex, Button, useTheme, Image, Input } from '@chakra-ui/react';
  Box,
  Flex,
  Button,
  useTheme,
  NumberInput,
  NumberInputField,
  NumberInputStepper,
  NumberIncrementStepper,
  NumberDecrementStepper,
  Image,
  Textarea,
  Input
 } from '@chakra-ui/react';
 import { useToast } from '@/hooks/useToast';
 import { useConfirm } from '@/hooks/useConfirm';
 import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
@ -48,7 +35,7 @@ type FileItemType = {
 const QAImport = ({ kbId }: { kbId: string }) => {
  const model = qaModelList[0]?.model;
  const unitPrice = qaModelList[0]?.price || 3;
-  const chunkLen = qaModelList[0].maxToken / 2;
+  const chunkLen = qaModelList[0].maxToken * 0.45;
  const theme = useTheme();
  const router = useRouter();
  const { toast } = useToast();
--- a/client/src/service/events/pushBill.ts
+++ b/client/src/service/events/pushBill.ts
@ -129,16 +129,26 @@ export const pushGenerateVectorBill = async ({
    try {
      // 计算价格. 至少为1
-      const unitPrice = global.vectorModels.find((item) => item.model === model)?.price || 0.2;
+      const vectorModel =
        global.vectorModels.find((item) => item.model === model) || global.vectorModels[0];
      const unitPrice = vectorModel.price || 0.2;
      let total = unitPrice * tokenLen;
      total = total > 1 ? total : 1;
      // 插入 Bill 记录
      const res = await Bill.create({
        userId,
-        model,
+        model: vectorModel.model,
        appName: '索引生成',
-        total
+        total,
        list: [
          {
            moduleName: '索引生成',
            amount: total,
            model: vectorModel.model,
            tokenLen
          }
        ]
      });
      billId = res._id;
--- a/client/src/utils/file.ts
+++ b/client/src/utils/file.ts
@ -2,7 +2,6 @@ import mammoth from 'mammoth';
 import Papa from 'papaparse';
 import { getOpenAiEncMap } from './plugin/openai';
 import { getErrText } from './tools';
 import { OpenAiChatEnum } from '@/constants/model';
 import { uploadImg } from '@/api/system';
 /**
@ -145,37 +144,38 @@ export const fileDownload = ({
 /**
 * text split into chunks
 * maxLen - one chunk len. max: 3500
- * slideLen - The size of the before and after Text
+ * overlapLen - The size of the before and after Text
- * maxLen > slideLen
+ * maxLen > overlapLen
 */
 export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
-  const slideLen = Math.floor(maxLen * 0.3);
+  const overlapLen = Math.floor(maxLen * 0.3); // Overlap length
  try {
-    const enc = getOpenAiEncMap();
+    const splitTexts = text.split(/(?<=[。！？.!?])/g);
    // filter empty text. encode sentence
    const encodeText = enc.encode(text);
    const chunks: string[] = [];
    let tokens = 0;
-    let startIndex = 0;
+    let preChunk = '';
-    let endIndex = Math.min(startIndex + maxLen, encodeText.length);
+    let chunk = '';
-    let chunkEncodeArr = encodeText.slice(startIndex, endIndex);
+    for (let i = 0; i < splitTexts.length; i++) {
-
+      const text = splitTexts[i];
-    const decoder = new TextDecoder();
+      chunk += text;
-
+      if (chunk.length > maxLen - overlapLen) {
-    while (startIndex < encodeText.length) {
+        preChunk += text;
      tokens += chunkEncodeArr.length;
      chunks.push(decoder.decode(enc.decode(chunkEncodeArr)));
      startIndex += maxLen - slideLen;
      endIndex = Math.min(startIndex + maxLen, encodeText.length);
      chunkEncodeArr = encodeText.slice(
        Math.min(encodeText.length - slideLen, startIndex),
        endIndex
      );
      }
      if (chunk.length >= maxLen) {
        chunks.push(chunk);
        chunk = preChunk;
        preChunk = '';
      }
    }
    if (chunk) {
      chunks.push(chunk);
    }
    const enc = getOpenAiEncMap();
    const encodeText = enc.encode(chunks.join(''));
    const tokens = encodeText.length;
    return {
      chunks,