perf: chunk filter
This commit is contained in:
parent
1964640d5c
commit
7fe20ef041
@ -118,9 +118,9 @@ const Navbar = ({ unread }: { unread: number }) => {
|
|||||||
}
|
}
|
||||||
: {
|
: {
|
||||||
color: 'myGray.500',
|
color: 'myGray.500',
|
||||||
backgroundColor: 'transparent'
|
backgroundColor: 'transparent',
|
||||||
|
onClick: () => router.push(item.link)
|
||||||
})}
|
})}
|
||||||
onClick={() => router.push(item.link)}
|
|
||||||
>
|
>
|
||||||
<MyIcon
|
<MyIcon
|
||||||
name={
|
name={
|
||||||
|
|||||||
@ -258,7 +258,9 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
|
|||||||
<Box>
|
<Box>
|
||||||
段落长度
|
段落长度
|
||||||
<MyTooltip
|
<MyTooltip
|
||||||
label={'基于 Gpt3.5 的 Token 计算方法进行分段。前后段落会有 30% 的内容重叠。'}
|
label={
|
||||||
|
'按结束标点符号进行分段。前后段落会有 30% 的内容重叠。\n中文文档建议不要超过800,英文不要超过1500'
|
||||||
|
}
|
||||||
forceShow
|
forceShow
|
||||||
>
|
>
|
||||||
<QuestionOutlineIcon ml={1} />
|
<QuestionOutlineIcon ml={1} />
|
||||||
@ -269,7 +271,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
|
|||||||
flex={1}
|
flex={1}
|
||||||
defaultValue={chunkLen}
|
defaultValue={chunkLen}
|
||||||
min={300}
|
min={300}
|
||||||
max={1000}
|
max={2000}
|
||||||
step={10}
|
step={10}
|
||||||
onChange={(e) => {
|
onChange={(e) => {
|
||||||
setChunkLen(+e);
|
setChunkLen(+e);
|
||||||
@ -294,10 +296,7 @@ const ChunkImport = ({ kbId }: { kbId: string }) => {
|
|||||||
<QuestionOutlineIcon ml={1} />
|
<QuestionOutlineIcon ml={1} />
|
||||||
</MyTooltip>
|
</MyTooltip>
|
||||||
</Box>
|
</Box>
|
||||||
<Box ml={4}>
|
<Box ml={4}>{price}元</Box>
|
||||||
{}
|
|
||||||
{price}元
|
|
||||||
</Box>
|
|
||||||
</Flex>
|
</Flex>
|
||||||
<Flex mt={3}>
|
<Flex mt={3}>
|
||||||
{showRePreview && (
|
{showRePreview && (
|
||||||
|
|||||||
@ -1,18 +1,5 @@
|
|||||||
import React, { useState, useCallback, useMemo } from 'react';
|
import React, { useState, useCallback, useMemo } from 'react';
|
||||||
import {
|
import { Box, Flex, Button, useTheme, Image, Input } from '@chakra-ui/react';
|
||||||
Box,
|
|
||||||
Flex,
|
|
||||||
Button,
|
|
||||||
useTheme,
|
|
||||||
NumberInput,
|
|
||||||
NumberInputField,
|
|
||||||
NumberInputStepper,
|
|
||||||
NumberIncrementStepper,
|
|
||||||
NumberDecrementStepper,
|
|
||||||
Image,
|
|
||||||
Textarea,
|
|
||||||
Input
|
|
||||||
} from '@chakra-ui/react';
|
|
||||||
import { useToast } from '@/hooks/useToast';
|
import { useToast } from '@/hooks/useToast';
|
||||||
import { useConfirm } from '@/hooks/useConfirm';
|
import { useConfirm } from '@/hooks/useConfirm';
|
||||||
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
|
import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
|
||||||
@ -48,7 +35,7 @@ type FileItemType = {
|
|||||||
const QAImport = ({ kbId }: { kbId: string }) => {
|
const QAImport = ({ kbId }: { kbId: string }) => {
|
||||||
const model = qaModelList[0]?.model;
|
const model = qaModelList[0]?.model;
|
||||||
const unitPrice = qaModelList[0]?.price || 3;
|
const unitPrice = qaModelList[0]?.price || 3;
|
||||||
const chunkLen = qaModelList[0].maxToken / 2;
|
const chunkLen = qaModelList[0].maxToken * 0.45;
|
||||||
const theme = useTheme();
|
const theme = useTheme();
|
||||||
const router = useRouter();
|
const router = useRouter();
|
||||||
const { toast } = useToast();
|
const { toast } = useToast();
|
||||||
|
|||||||
@ -129,16 +129,26 @@ export const pushGenerateVectorBill = async ({
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
// 计算价格. 至少为1
|
// 计算价格. 至少为1
|
||||||
const unitPrice = global.vectorModels.find((item) => item.model === model)?.price || 0.2;
|
const vectorModel =
|
||||||
|
global.vectorModels.find((item) => item.model === model) || global.vectorModels[0];
|
||||||
|
const unitPrice = vectorModel.price || 0.2;
|
||||||
let total = unitPrice * tokenLen;
|
let total = unitPrice * tokenLen;
|
||||||
total = total > 1 ? total : 1;
|
total = total > 1 ? total : 1;
|
||||||
|
|
||||||
// 插入 Bill 记录
|
// 插入 Bill 记录
|
||||||
const res = await Bill.create({
|
const res = await Bill.create({
|
||||||
userId,
|
userId,
|
||||||
model,
|
model: vectorModel.model,
|
||||||
appName: '索引生成',
|
appName: '索引生成',
|
||||||
total
|
total,
|
||||||
|
list: [
|
||||||
|
{
|
||||||
|
moduleName: '索引生成',
|
||||||
|
amount: total,
|
||||||
|
model: vectorModel.model,
|
||||||
|
tokenLen
|
||||||
|
}
|
||||||
|
]
|
||||||
});
|
});
|
||||||
billId = res._id;
|
billId = res._id;
|
||||||
|
|
||||||
|
|||||||
@ -2,7 +2,6 @@ import mammoth from 'mammoth';
|
|||||||
import Papa from 'papaparse';
|
import Papa from 'papaparse';
|
||||||
import { getOpenAiEncMap } from './plugin/openai';
|
import { getOpenAiEncMap } from './plugin/openai';
|
||||||
import { getErrText } from './tools';
|
import { getErrText } from './tools';
|
||||||
import { OpenAiChatEnum } from '@/constants/model';
|
|
||||||
import { uploadImg } from '@/api/system';
|
import { uploadImg } from '@/api/system';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -145,37 +144,38 @@ export const fileDownload = ({
|
|||||||
/**
|
/**
|
||||||
* text split into chunks
|
* text split into chunks
|
||||||
* maxLen - one chunk len. max: 3500
|
* maxLen - one chunk len. max: 3500
|
||||||
* slideLen - The size of the before and after Text
|
* overlapLen - The size of the before and after Text
|
||||||
* maxLen > slideLen
|
* maxLen > overlapLen
|
||||||
*/
|
*/
|
||||||
export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
|
export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
|
||||||
const slideLen = Math.floor(maxLen * 0.3);
|
const overlapLen = Math.floor(maxLen * 0.3); // Overlap length
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const enc = getOpenAiEncMap();
|
const splitTexts = text.split(/(?<=[。!?.!?])/g);
|
||||||
// filter empty text. encode sentence
|
|
||||||
const encodeText = enc.encode(text);
|
|
||||||
|
|
||||||
const chunks: string[] = [];
|
const chunks: string[] = [];
|
||||||
let tokens = 0;
|
|
||||||
|
|
||||||
let startIndex = 0;
|
let preChunk = '';
|
||||||
let endIndex = Math.min(startIndex + maxLen, encodeText.length);
|
let chunk = '';
|
||||||
let chunkEncodeArr = encodeText.slice(startIndex, endIndex);
|
for (let i = 0; i < splitTexts.length; i++) {
|
||||||
|
const text = splitTexts[i];
|
||||||
const decoder = new TextDecoder();
|
chunk += text;
|
||||||
|
if (chunk.length > maxLen - overlapLen) {
|
||||||
while (startIndex < encodeText.length) {
|
preChunk += text;
|
||||||
tokens += chunkEncodeArr.length;
|
|
||||||
chunks.push(decoder.decode(enc.decode(chunkEncodeArr)));
|
|
||||||
|
|
||||||
startIndex += maxLen - slideLen;
|
|
||||||
endIndex = Math.min(startIndex + maxLen, encodeText.length);
|
|
||||||
chunkEncodeArr = encodeText.slice(
|
|
||||||
Math.min(encodeText.length - slideLen, startIndex),
|
|
||||||
endIndex
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
if (chunk.length >= maxLen) {
|
||||||
|
chunks.push(chunk);
|
||||||
|
chunk = preChunk;
|
||||||
|
preChunk = '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (chunk) {
|
||||||
|
chunks.push(chunk);
|
||||||
|
}
|
||||||
|
|
||||||
|
const enc = getOpenAiEncMap();
|
||||||
|
const encodeText = enc.encode(chunks.join(''));
|
||||||
|
const tokens = encodeText.length;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
chunks,
|
chunks,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user