perf: chunk filter

archer 2023-08-07 10:59:31 +08:00
parent 1964640d5c
commit 7fe20ef041
5 changed files with 47 additions and 51 deletions

View File

@@ -118,9 +118,9 @@ const Navbar = ({ unread }: { unread: number }) => {
             }
           : {
               color: 'myGray.500',
-              backgroundColor: 'transparent'
+              backgroundColor: 'transparent',
+              onClick: () => router.push(item.link)
             })}
-        onClick={() => router.push(item.link)}
       >
         <MyIcon
           name={

View File

@@ -258,7 +258,9 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
          <Box>
            <MyTooltip
-              label={'基于 Gpt3.5 的 Token 计算方法进行分段。前后段落会有 30% 的内容重叠。'}
+              label={
+                '按结束标点符号进行分段。前后段落会有 30% 的内容重叠。\n中文文档建议不要超过800英文不要超过1500'
+              }
              forceShow
            >
              <QuestionOutlineIcon ml={1} />
@@ -269,7 +271,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
              flex={1}
              defaultValue={chunkLen}
              min={300}
-              max={1000}
+              max={2000}
              step={10}
              onChange={(e) => {
                setChunkLen(+e);
@@ -294,10 +296,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
              <QuestionOutlineIcon ml={1} />
            </MyTooltip>
          </Box>
-          <Box ml={4}>
-            {}
-            {price}
-          </Box>
+          <Box ml={4}>{price}</Box>
        </Flex>
        <Flex mt={3}>
          {showRePreview && (

View File

@@ -1,18 +1,5 @@
 import React, { useState, useCallback, useMemo } from 'react';
-import {
-  Box,
-  Flex,
-  Button,
-  useTheme,
-  NumberInput,
-  NumberInputField,
-  NumberInputStepper,
-  NumberIncrementStepper,
-  NumberDecrementStepper,
-  Image,
-  Textarea,
-  Input
-} from '@chakra-ui/react';
+import { Box, Flex, Button, useTheme, Image, Input } from '@chakra-ui/react';
 import { useToast } from '@/hooks/useToast';
 import { useConfirm } from '@/hooks/useConfirm';
 import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
@@ -48,7 +35,7 @@ type FileItemType = {
 const QAImport = ({ kbId }: { kbId: string }) => {
   const model = qaModelList[0]?.model;
   const unitPrice = qaModelList[0]?.price || 3;
-  const chunkLen = qaModelList[0].maxToken / 2;
+  const chunkLen = qaModelList[0].maxToken * 0.45;
   const theme = useTheme();
   const router = useRouter();
   const { toast } = useToast();

View File

@@ -129,16 +129,26 @@ export const pushGenerateVectorBill = async ({
   try {
     // 计算价格. 至少为1
-    const unitPrice = global.vectorModels.find((item) => item.model === model)?.price || 0.2;
+    const vectorModel =
+      global.vectorModels.find((item) => item.model === model) || global.vectorModels[0];
+    const unitPrice = vectorModel.price || 0.2;
     let total = unitPrice * tokenLen;
     total = total > 1 ? total : 1;
     // 插入 Bill 记录
     const res = await Bill.create({
       userId,
-      model,
+      model: vectorModel.model,
       appName: '索引生成',
-      total
+      total,
+      list: [
+        {
+          moduleName: '索引生成',
+          amount: total,
+          model: vectorModel.model,
+          tokenLen
+        }
+      ]
     });
     billId = res._id;

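The billing change above falls back to the first configured vector model when the requested model is not found, and records a per-module breakdown in a new list field alongside the total. A minimal standalone sketch of that pricing path in TypeScript; the VectorModel type and the sample model entry are illustrative assumptions, not the repo's real global.vectorModels config:

// Sketch only: the type and sample values below are assumptions, not the repo's real config.
type VectorModel = { model: string; price: number };

const vectorModels: VectorModel[] = [{ model: 'text-embedding-ada-002', price: 0.2 }];

function buildVectorBillFields(model: string, tokenLen: number) {
  // Fall back to the first configured model if the requested one is unknown.
  const vectorModel = vectorModels.find((item) => item.model === model) || vectorModels[0];
  const unitPrice = vectorModel.price || 0.2;

  // Price is unitPrice * tokenLen, with a minimum of 1.
  let total = unitPrice * tokenLen;
  total = total > 1 ? total : 1;

  return {
    model: vectorModel.model,
    appName: '索引生成',
    total,
    list: [{ moduleName: '索引生成', amount: total, model: vectorModel.model, tokenLen }]
  };
}

// e.g. buildVectorBillFields('text-embedding-ada-002', 1000) yields total = 200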
View File

@@ -2,7 +2,6 @@ import mammoth from 'mammoth';
 import Papa from 'papaparse';
 import { getOpenAiEncMap } from './plugin/openai';
 import { getErrText } from './tools';
-import { OpenAiChatEnum } from '@/constants/model';
 import { uploadImg } from '@/api/system';

 /**
@@ -145,38 +144,39 @@ export const fileDownload = ({
 /**
  * text split into chunks
  * maxLen - one chunk len. max: 3500
- * slideLen - The size of the before and after Text
- * maxLen > slideLen
+ * overlapLen - The size of the before and after Text
+ * maxLen > overlapLen
  */
 export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
-  const slideLen = Math.floor(maxLen * 0.3);
+  const overlapLen = Math.floor(maxLen * 0.3); // Overlap length
   try {
-    const enc = getOpenAiEncMap();
-    // filter empty text. encode sentence
-    const encodeText = enc.encode(text);
+    const splitTexts = text.split(/(?<=[。!?.!?])/g);
     const chunks: string[] = [];
-    let tokens = 0;
-    let startIndex = 0;
-    let endIndex = Math.min(startIndex + maxLen, encodeText.length);
-    let chunkEncodeArr = encodeText.slice(startIndex, endIndex);
-    const decoder = new TextDecoder();
-    while (startIndex < encodeText.length) {
-      tokens += chunkEncodeArr.length;
-      chunks.push(decoder.decode(enc.decode(chunkEncodeArr)));
-      startIndex += maxLen - slideLen;
-      endIndex = Math.min(startIndex + maxLen, encodeText.length);
-      chunkEncodeArr = encodeText.slice(
-        Math.min(encodeText.length - slideLen, startIndex),
-        endIndex
-      );
+    let preChunk = '';
+    let chunk = '';
+    for (let i = 0; i < splitTexts.length; i++) {
+      const text = splitTexts[i];
+      chunk += text;
+      if (chunk.length > maxLen - overlapLen) {
+        preChunk += text;
+      }
+      if (chunk.length >= maxLen) {
+        chunks.push(chunk);
+        chunk = preChunk;
+        preChunk = '';
+      }
     }
+    if (chunk) {
+      chunks.push(chunk);
+    }
+    const enc = getOpenAiEncMap();
+    const encodeText = enc.encode(chunks.join(''));
+    const tokens = encodeText.length;
     return {
       chunks,
       tokens
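With this change, splitText_token no longer slides a fixed token window over the encoded text; it splits the input on sentence-ending punctuation, accumulates sentences until a chunk reaches maxLen characters, carries the sentences that fell into roughly the last 30% of that chunk over as the start of the next one, and tokenizes the joined chunks once at the end to report a token count. A rough usage sketch, assuming the helper is exported from '@/utils/file' (the same module the imports above reference); the sample text and maxLen are made up for the example:

// Illustrative only: the sample text and maxLen are assumptions for the example.
import { splitText_token } from '@/utils/file';

const text = '第一句。第二句。第三句。第四句。第五句。';
const { chunks, tokens } = splitText_token({ text, maxLen: 12 });

// Each chunk ends on a sentence boundary; sentences that land in the last
// ~30% of a chunk are repeated at the start of the next chunk as overlap.
// `tokens` is the token count of all chunks joined together.
console.log(chunks, tokens);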