feat: 拆分文本增加滑块,增加直接分段导入方式

This commit is contained in:
archer 2023-04-23 22:36:04 +08:00
parent 2774940851
commit e0b1a78344
No known key found for this signature in database
GPG Key ID: 569A5660D2379E28
15 changed files with 317 additions and 155 deletions

View File

@ -85,8 +85,12 @@ export const postModelDataInput = (data: {
/** /**
* *
*/ */
export const postModelDataSplitData = (data: { modelId: string; text: string; prompt: string }) => export const postModelDataSplitData = (data: {
POST(`/model/data/splitData`, data); modelId: string;
chunks: string[];
prompt: string;
mode: 'qa' | 'subsection';
}) => POST(`/model/data/splitData`, data);
/** /**
* json导入数据 * json导入数据

View File

@ -0,0 +1,52 @@
import React from 'react';
import { Stack, Flex } from '@chakra-ui/react';
import type { StackProps } from '@chakra-ui/react';

interface Props extends StackProps {
  // options to render; `value` doubles as the React key, so values must be unique
  list: { label: string; value: string | number }[];
  // currently selected option value
  value: string | number;
  // called with the clicked option's value
  onChange: (e: string | number) => void;
}

/**
 * Horizontal radio group rendered as clickable labels with a CSS-drawn dot
 * (via `_before`), replacing Chakra's default Radio visuals.
 *
 * Note: `spacing` and `direction` are set after `{...props}` is spread, so
 * callers cannot override them through StackProps.
 */
const Radio = ({ list, value, onChange, ...props }: Props) => {
  return (
    <Stack {...props} spacing={5} direction={'row'}>
      {list.map((item) => (
        <Flex
          key={item.value}
          alignItems={'center'}
          cursor={'pointer'}
          userSelect={'none'}
          _before={{
            content: '""',
            w: '16px',
            h: '16px',
            mr: 1,
            borderRadius: '16px',
            transition: '0.2s',
            // selected: thick blue ring (reads as a filled dot); unselected: thin gray ring
            ...(value === item.value
              ? {
                  border: '5px solid',
                  borderColor: 'blue.500'
                }
              : {
                  border: '2px solid',
                  borderColor: 'gray.200'
                })
          }}
          _hover={{
            _before: {
              borderColor: 'blue.400'
            }
          }}
          onClick={() => onChange(item.value)}
        >
          {item.label}
        </Flex>
      ))}
    </Stack>
  );
};
export default Radio;

View File

@ -106,16 +106,20 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}); });
} else { } else {
// 有匹配情况下,添加知识库内容。 // 有匹配情况下,添加知识库内容。
// 系统提示词过滤,最多 2000 tokens // 系统提示词过滤,最多 3000 tokens
const systemPrompt = systemPromptFilter(formatRedisPrompt, 2000); const systemPrompt = systemPromptFilter(formatRedisPrompt, 3000);
prompts.unshift({ prompts.unshift({
obj: 'SYSTEM', obj: 'SYSTEM',
value: `${ value: `
model.systemPrompt || '根据知识库内容回答' ${model.systemPrompt}
} ,下面是知识库内容:当前时间为${dayjs().format( ${
'YYYY/MM/DD HH:mm:ss' model.search.mode === ModelVectorSearchModeEnum.hightSimilarity
)}\n${systemPrompt}` ? `你只能从知识库选择内容回答.不在知识库内容拒绝回复`
: ''
}
知识库内容为: 当前时间为${dayjs().format('YYYY/MM/DD HH:mm:ss')}\n${systemPrompt}'
`
}); });
} }

View File

@ -4,7 +4,7 @@ import { connectToDatabase, Model } from '@/service/mongo';
import { authToken } from '@/service/utils/tools'; import { authToken } from '@/service/utils/tools';
import { ModelDataSchema } from '@/types/mongoSchema'; import { ModelDataSchema } from '@/types/mongoSchema';
import { generateVector } from '@/service/events/generateVector'; import { generateVector } from '@/service/events/generateVector';
import { connectPg, PgClient } from '@/service/pg'; import { PgClient } from '@/service/pg';
export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) { export default async function handler(req: NextApiRequest, res: NextApiResponse<any>) {
try { try {
@ -26,7 +26,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse<
const userId = await authToken(authorization); const userId = await authToken(authorization);
await connectToDatabase(); await connectToDatabase();
const pg = await connectPg();
// 验证是否是该用户的 model // 验证是否是该用户的 model
const model = await Model.findOne({ const model = await Model.findOne({

View File

@ -2,14 +2,20 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { jsonRes } from '@/service/response'; import { jsonRes } from '@/service/response';
import { connectToDatabase, SplitData, Model } from '@/service/mongo'; import { connectToDatabase, SplitData, Model } from '@/service/mongo';
import { authToken } from '@/service/utils/tools'; import { authToken } from '@/service/utils/tools';
import { generateVector } from '@/service/events/generateVector';
import { generateQA } from '@/service/events/generateQA'; import { generateQA } from '@/service/events/generateQA';
import { encode } from 'gpt-token-utils'; import { PgClient } from '@/service/pg';
/* 拆分数据成QA */ /* 拆分数据成QA */
export default async function handler(req: NextApiRequest, res: NextApiResponse) { export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try { try {
const { text, modelId, prompt } = req.body as { text: string; modelId: string; prompt: string }; const { chunks, modelId, prompt, mode } = req.body as {
if (!text || !modelId || !prompt) { modelId: string;
chunks: string[];
prompt: string;
mode: 'qa' | 'subsection';
};
if (!chunks || !modelId || !prompt) {
throw new Error('参数错误'); throw new Error('参数错误');
} }
await connectToDatabase(); await connectToDatabase();
@ -28,46 +34,31 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
throw new Error('无权操作该模型'); throw new Error('无权操作该模型');
} }
const replaceText = text.replace(/\\n/g, '\n'); if (mode === 'qa') {
// 批量QA拆分插入数据
await SplitData.create({
userId,
modelId,
textList: chunks,
prompt
});
// 文本拆分成 chunk generateQA();
const chunks = replaceText.split('\n').filter((item) => item.trim()); } else if (mode === 'subsection') {
// 插入记录
await PgClient.insert('modelData', {
values: chunks.map((item) => [
{ key: 'user_id', value: userId },
{ key: 'model_id', value: modelId },
{ key: 'q', value: item },
{ key: 'a', value: '' },
{ key: 'status', value: 'waiting' }
])
});
const textList: string[] = []; generateVector();
let splitText = '';
/* 取 2.5k ~ 3.5K tokens 内容 */
chunks.forEach((chunk) => {
const tokens = encode(splitText + chunk).length;
if (tokens >= 3500) {
// 超过 3500不要这块内容
splitText && textList.push(splitText);
splitText = chunk;
} else if (tokens >= 2500) {
// 超过 3000取内容
splitText && textList.push(splitText + chunk);
splitText = '';
} else {
//没超过 3000继续添加
splitText += chunk;
}
});
if (splitText) {
textList.push(splitText);
} }
// 批量插入数据
await SplitData.create({
userId,
modelId,
rawText: text,
textList,
prompt
});
generateQA();
jsonRes(res); jsonRes(res);
} catch (err) { } catch (err) {
jsonRes(res, { jsonRes(res, {

View File

@ -126,16 +126,20 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}); });
} else { } else {
// 有匹配或者低匹配度模式情况下,添加知识库内容。 // 有匹配或者低匹配度模式情况下,添加知识库内容。
// 系统提示词过滤,最多 2000 tokens // 系统提示词过滤,最多 3000 tokens
const systemPrompt = systemPromptFilter(formatRedisPrompt, 2000); const systemPrompt = systemPromptFilter(formatRedisPrompt, 3000);
prompts.unshift({ prompts.unshift({
obj: 'SYSTEM', obj: 'SYSTEM',
value: `${ value: `
model.systemPrompt || '根据知识库内容回答' ${model.systemPrompt}
} ,下面是知识库内容:当前时间为${dayjs().format( ${
'YYYY/MM/DD HH:mm:ss' model.search.mode === ModelVectorSearchModeEnum.hightSimilarity
)}\n${systemPrompt}` ? `你只能从知识库选择内容回答.不在知识库内容拒绝回复`
: ''
}
知识库内容为: 当前时间为${dayjs().format('YYYY/MM/DD HH:mm:ss')}\n${systemPrompt}'
`
}); });
} }

View File

@ -133,7 +133,7 @@ const Chat = ({ modelId, chatId }: { modelId: string; chatId: string }) => {
if (isScroll && res.history.length > 0) { if (isScroll && res.history.length > 0) {
setTimeout(() => { setTimeout(() => {
scrollToBottom('auto'); scrollToBottom('auto');
}, 2000); }, 1200);
} }
} catch (e: any) { } catch (e: any) {
toast({ toast({

View File

@ -122,9 +122,9 @@ const InputDataModal = ({
<Box h={'30px'}></Box> <Box h={'30px'}></Box>
<Textarea <Textarea
placeholder={ placeholder={
'相关问题,可以输入多个问法, 最多500字。例如:\n1. laf 是什么?\n2. laf 可以做什么?\n3. laf怎么用' '相关问题,可以输入多个问法, 最多 1000 字。例如:\n1. laf 是什么?\n2. laf 可以做什么?\n3. laf怎么用'
} }
maxLength={500} maxLength={1000}
resize={'none'} resize={'none'}
h={'calc(100% - 30px)'} h={'calc(100% - 30px)'}
{...register(`q`, { {...register(`q`, {
@ -136,9 +136,9 @@ const InputDataModal = ({
<Box h={'30px'}></Box> <Box h={'30px'}></Box>
<Textarea <Textarea
placeholder={ placeholder={
'知识点,最多1000字。请保持主语的完整性缺少主语会导致效果不佳。例如:\n1. laf是一个云函数开发平台。\n2. laf 什么都能做。\n3. 下面是使用 laf 的例子: ……' '知识点,最多 2000 字。例如:\n1. laf是一个云函数开发平台。\n2. laf 什么都能做。\n3. 下面是使用 laf 的例子: ……'
} }
maxLength={1000} maxLength={2000}
resize={'none'} resize={'none'}
h={'calc(100% - 30px)'} h={'calc(100% - 30px)'}
{...register(`a`, { {...register(`a`, {

View File

@ -18,6 +18,7 @@ import {
MenuItem, MenuItem,
Input Input
} from '@chakra-ui/react'; } from '@chakra-ui/react';
import type { BoxProps } from '@chakra-ui/react';
import type { ModelSchema } from '@/types/mongoSchema'; import type { ModelSchema } from '@/types/mongoSchema';
import type { ModelDataItemType } from '@/types/model'; import type { ModelDataItemType } from '@/types/model';
import { ModelDataStatusMap } from '@/constants/model'; import { ModelDataStatusMap } from '@/constants/model';
@ -114,6 +115,14 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
} }
}); });
const tdStyles: BoxProps = {
fontSize: 'xs',
maxW: '500px',
whiteSpace: 'pre-wrap',
maxH: '250px',
overflowY: 'auto'
};
return ( return (
<> <>
<Flex> <Flex>
@ -156,8 +165,8 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
> >
</MenuItem> </MenuItem>
<MenuItem onClick={onOpenSelectFileModal}>/ QA </MenuItem> <MenuItem onClick={onOpenSelectFileModal}>/</MenuItem>
<MenuItem onClick={onOpenSelectUrlModal}> QA </MenuItem> {/* <MenuItem onClick={onOpenSelectUrlModal}>网站内容拆分</MenuItem> */}
<MenuItem onClick={onOpenSelectCsvModal}>csv </MenuItem> <MenuItem onClick={onOpenSelectCsvModal}>csv </MenuItem>
</MenuList> </MenuList>
</Menu> </Menu>
@ -191,33 +200,23 @@ const ModelDataCard = ({ model }: { model: ModelSchema }) => {
<Box mt={4}> <Box mt={4}>
<TableContainer minH={'500px'}> <TableContainer minH={'500px'}>
<Table variant={'simple'}> <Table variant={'simple'} w={'100%'}>
<Thead> <Thead>
<Tr> <Tr>
<Th>Question</Th> <Th>()</Th>
<Th>Text</Th> <Th></Th>
<Th>Status</Th> <Th></Th>
<Th></Th> <Th></Th>
</Tr> </Tr>
</Thead> </Thead>
<Tbody> <Tbody>
{modelDataList.map((item) => ( {modelDataList.map((item) => (
<Tr key={item.id}> <Tr key={item.id}>
<Td minW={'200px'}> <Td>
<Box fontSize={'xs'} whiteSpace={'pre-wrap'}> <Box {...tdStyles}>{item.q}</Box>
{item.q}
</Box>
</Td> </Td>
<Td minW={'200px'}> <Td>
<Box <Box {...tdStyles}>{item.a || '-'}</Box>
w={'100%'}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
maxH={'250px'}
overflowY={'auto'}
>
{item.a}
</Box>
</Td> </Td>
<Td>{ModelDataStatusMap[item.status]}</Td> <Td>{ModelDataStatusMap[item.status]}</Td>
<Td> <Td>

View File

@ -1,4 +1,4 @@
import React, { useState, useCallback } from 'react'; import React, { useState, useCallback, useMemo } from 'react';
import { import {
Box, Box,
Flex, Flex,
@ -20,9 +20,26 @@ import { readTxtContent, readPdfContent, readDocContent } from '@/utils/file';
import { useMutation } from '@tanstack/react-query'; import { useMutation } from '@tanstack/react-query';
import { postModelDataSplitData } from '@/api/model'; import { postModelDataSplitData } from '@/api/model';
import { formatPrice } from '@/utils/user'; import { formatPrice } from '@/utils/user';
import Radio from '@/components/Radio';
import { splitText } from '@/utils/file';
const fileExtension = '.txt,.doc,.docx,.pdf,.md'; const fileExtension = '.txt,.doc,.docx,.pdf,.md';
// Per-import-mode configuration.
// maxLen / slideLen feed splitText (chunk token budget and overlap window);
// price is the per-token price multiplier used for the cost estimate;
// isPrompt toggles whether the prompt input row is shown in the modal.
const modeMap = {
  qa: {
    maxLen: 2800,
    slideLen: 800,
    price: 3,
    isPrompt: true
  },
  subsection: {
    maxLen: 1000,
    slideLen: 300,
    price: 0.4,
    isPrompt: false
  }
};
const SelectFileModal = ({ const SelectFileModal = ({
onClose, onClose,
onSuccess, onSuccess,
@ -36,38 +53,45 @@ const SelectFileModal = ({
const { toast } = useToast(); const { toast } = useToast();
const [prompt, setPrompt] = useState(''); const [prompt, setPrompt] = useState('');
const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true }); const { File, onOpen } = useSelectFile({ fileType: fileExtension, multiple: true });
const [fileText, setFileText] = useState(''); const [mode, setMode] = useState<'qa' | 'subsection'>('qa');
const [fileTextArr, setFileTextArr] = useState<string[]>(['']);
const { openConfirm, ConfirmChild } = useConfirm({ const { openConfirm, ConfirmChild } = useConfirm({
content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。' content: '确认导入该文件,需要一定时间进行拆解,该任务无法终止!如果余额不足,任务讲被终止。'
}); });
const fileText = useMemo(() => {
const chunks = fileTextArr.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
);
return chunks.join('');
}, [fileTextArr, mode]);
const onSelectFile = useCallback( const onSelectFile = useCallback(
async (e: File[]) => { async (e: File[]) => {
setSelecting(true); setSelecting(true);
try { try {
const fileTexts = ( const fileTexts = await Promise.all(
await Promise.all( e.map((file) => {
e.map((file) => { // @ts-ignore
// @ts-ignore const extension = file?.name?.split('.').pop().toLowerCase();
const extension = file?.name?.split('.').pop().toLowerCase(); switch (extension) {
switch (extension) { case 'txt':
case 'txt': case 'md':
case 'md': return readTxtContent(file);
return readTxtContent(file); case 'pdf':
case 'pdf': return readPdfContent(file);
return readPdfContent(file); case 'doc':
case 'doc': case 'docx':
case 'docx': return readDocContent(file);
return readDocContent(file); default:
default: return '';
return ''; }
} })
}) );
) setFileTextArr(fileTexts);
)
.join(' ')
.replace(/(\\n|\n)+/g, '\n');
setFileText(fileTexts);
} catch (error: any) { } catch (error: any) {
console.log(error); console.log(error);
toast({ toast({
@ -77,16 +101,25 @@ const SelectFileModal = ({
} }
setSelecting(false); setSelecting(false);
}, },
[setSelecting, toast] [toast]
); );
const { mutate, isLoading } = useMutation({ const { mutate, isLoading } = useMutation({
mutationFn: async () => { mutationFn: async () => {
if (!fileText) return; if (!fileText) return;
const chunks = fileTextArr
.map((item) =>
splitText({
text: item,
...modeMap[mode]
})
)
.flat();
await postModelDataSplitData({ await postModelDataSplitData({
modelId, modelId,
text: fileText.replace(/\\n/g, '\n').replace(/\n+/g, '\n'), chunks,
prompt: `下面是"${prompt || '一段长文本'}"` prompt: `下面是"${prompt || '一段长文本'}"`,
mode
}); });
toast({ toast({
title: '导入数据成功,需要一段拆解和训练', title: '导入数据成功,需要一段拆解和训练',
@ -106,58 +139,82 @@ const SelectFileModal = ({
return ( return (
<Modal isOpen={true} onClose={onClose} isCentered> <Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay /> <ModalOverlay />
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}> <ModalContent maxW={'min(1000px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
<ModalHeader></ModalHeader> <ModalHeader></ModalHeader>
<ModalCloseButton /> <ModalCloseButton />
<ModalBody <ModalBody
display={'flex'} display={'flex'}
flexDirection={'column'} flexDirection={'column'}
p={4} p={0}
h={'100%'} h={'100%'}
alignItems={'center'} alignItems={'center'}
justifyContent={'center'} justifyContent={'center'}
fontSize={'sm'} fontSize={'sm'}
> >
<Button isLoading={selecting} onClick={onOpen}> <Box mt={2} px={4} maxW={['100%']} textAlign={'justify'} color={'blackAlpha.600'}>
</Button>
<Box mt={2} maxW={['100%', '70%']}>
{fileExtension} QA {fileExtension} QA
tokens tokens {encode(fileText).length}{' '}
tokens {formatPrice(encode(fileText).length * modeMap[mode].price)}
</Box> </Box>
<Box mt={2}> {/* 拆分模式 */}
{encode(fileText).length} tokens {formatPrice(encode(fileText).length * 3)} <Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
<Box flex={'0 0 70px'}>:</Box>
</Box> <Radio
<Flex w={'100%'} alignItems={'center'} my={4}> ml={3}
<Box flex={'0 0 auto'} mr={2}> list={[
{ label: 'QA拆分', value: 'qa' },
</Box> { label: '直接分段', value: 'subsection' }
<Input ]}
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本" value={mode}
value={prompt} onChange={(e) => setMode(e as 'subsection' | 'qa')}
onChange={(e) => setPrompt(e.target.value)}
size={'sm'}
/> />
</Flex> </Flex>
<Textarea {/* 内容介绍 */}
flex={'1 0 0'} {modeMap[mode].isPrompt && (
h={0} <Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
w={'100%'} <Box flex={'0 0 70px'} mr={2}>
placeholder="文件内容"
maxLength={-1} </Box>
resize={'none'} <Input
fontSize={'xs'} placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
whiteSpace={'pre-wrap'} value={prompt}
value={fileText} onChange={(e) => setPrompt(e.target.value)}
onChange={(e) => setFileText(e.target.value)} size={'sm'}
/> />
</Flex>
)}
{/* 文本内容 */}
<Box flex={'1 0 0'} px={5} h={0} w={'100%'} overflowY={'auto'} mt={4}>
{fileTextArr.map((item, i) => (
<Box key={i} mb={5}>
<Box mb={1}>{i + 1}</Box>
<Textarea
placeholder="文件内容"
maxLength={-1}
rows={10}
fontSize={'xs'}
whiteSpace={'pre-wrap'}
value={item}
onChange={(e) => {
setFileTextArr([
...fileTextArr.slice(0, i),
e.target.value,
...fileTextArr.slice(i + 1)
]);
}}
/>
</Box>
))}
</Box>
</ModalBody> </ModalBody>
<Flex px={6} pt={2} pb={4}> <Flex px={6} pt={2} pb={4}>
<Button isLoading={selecting} onClick={onOpen}>
</Button>
<Box flex={1}></Box> <Box flex={1}></Box>
<Button variant={'outline'} mr={3} onClick={onClose}> <Button variant={'outline'} colorScheme={'gray'} mr={3} onClick={onClose}>
</Button> </Button>
<Button isLoading={isLoading} isDisabled={fileText === ''} onClick={openConfirm(mutate)}> <Button isLoading={isLoading} isDisabled={fileText === ''} onClick={openConfirm(mutate)}>

View File

@ -44,8 +44,9 @@ const SelectUrlModal = ({
if (!webText) return; if (!webText) return;
await postModelDataSplitData({ await postModelDataSplitData({
modelId, modelId,
text: webText, chunks: [],
prompt: `下面是"${prompt || '一段长文本'}"` prompt: `下面是"${prompt || '一段长文本'}"`,
mode: 'qa'
}); });
toast({ toast({
title: '导入数据成功,需要一段拆解和训练', title: '导入数据成功,需要一段拆解和训练',
@ -89,7 +90,7 @@ const SelectUrlModal = ({
<Modal isOpen={true} onClose={onClose} isCentered> <Modal isOpen={true} onClose={onClose} isCentered>
<ModalOverlay /> <ModalOverlay />
<ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}> <ModalContent maxW={'min(900px, 90vw)'} m={0} position={'relative'} h={'90vh'}>
<ModalHeader></ModalHeader> <ModalHeader></ModalHeader>
<ModalCloseButton /> <ModalCloseButton />
<ModalBody <ModalBody
@ -102,7 +103,7 @@ const SelectUrlModal = ({
fontSize={'sm'} fontSize={'sm'}
> >
<Box mt={2} maxW={['100%', '70%']}> <Box mt={2} maxW={['100%', '70%']}>
QA tokens QA tokens
</Box> </Box>
<Box mt={2}> <Box mt={2}>

View File

@ -69,9 +69,13 @@ export async function generateQA(next = false): Promise<any> {
const chatAPI = getOpenAIApi(userApiKey || systemKey); const chatAPI = getOpenAIApi(userApiKey || systemKey);
const systemPrompt: ChatCompletionRequestMessage = { const systemPrompt: ChatCompletionRequestMessage = {
role: 'system', role: 'system',
content: `你是出题官.${ content: `你是出题人
dataItem.prompt || '下面是"一段长文本"' ${dataItem.prompt || '下面是"一段长文本"'}
},520,,,代码题等.答案要详细.按格式返回: Q1:\nA1:\nQ2:\nA2:\n` 520,,,代码题等.答案要详细.按格式返回: Q1:
A1:
Q2:
A2:
...`
}; };
// 请求 chatgpt 获取回答 // 请求 chatgpt 获取回答

View File

@ -18,10 +18,6 @@ const SplitDataSchema = new Schema({
ref: 'model', ref: 'model',
required: true required: true
}, },
rawText: {
type: String,
required: true
},
textList: { textList: {
type: [String], type: [String],
default: [] default: []

View File

@ -75,7 +75,6 @@ export interface ModelSplitDataSchema {
_id: string; _id: string;
userId: string; userId: string;
modelId: string; modelId: string;
rawText: string;
prompt: string; prompt: string;
errorText: string; errorText: string;
textList: string[]; textList: string[];

View File

@ -1,5 +1,6 @@
import mammoth from 'mammoth'; import mammoth from 'mammoth';
import Papa from 'papaparse'; import Papa from 'papaparse';
import { encode } from 'gpt-token-utils';
/** /**
* txt * txt
@ -137,3 +138,54 @@ export const fileDownload = ({
downloadLink.click(); downloadLink.click();
document.body.removeChild(downloadLink); document.body.removeChild(downloadLink);
}; };
/**
 * Split text into overlapping chunks by token count.
 *
 * maxLen  - max token length of one chunk (max: 3500)
 * slideLen - token length of the overlap carried from the tail of one chunk
 *            into the start of the next; must be smaller than maxLen
 */
export const splitText = ({
  text,
  maxLen,
  slideLen
}: {
  text: string;
  maxLen: number;
  slideLen: number;
}) => {
  // Tokenize into runs of sentence punctuation/newlines and runs of
  // non-whitespace, dropping pieces that are empty after cleaning.
  // NOTE(review): chunks are re-joined with '' below, which drops the
  // whitespace between words — confirm this is only used for CJK-style text.
  const textArr =
    text.match(/[!?。\n.]+|[^\s]+/g)?.filter((item) => {
      const cleaned = item.replace(/(\\n)/g, '\n').trim();
      return Boolean(cleaned && cleaned !== '\n');
    }) || [];

  const chunks: { sum: number; arr: string[] }[] = [{ sum: 0, arr: [] }];

  for (let i = 0; i < textArr.length; i++) {
    const tokenLen = encode(textArr[i]).length;
    chunks[chunks.length - 1].sum += tokenLen;
    chunks[chunks.length - 1].arr.push(textArr[i]);

    // Current chunk reached maxLen: close it and start a new one.
    // (Fix: the original tested `sum + tokenLen >= maxLen` after tokenLen had
    // already been added to sum, double-counting the current piece and
    // closing chunks well before the maxLen budget.)
    if (chunks[chunks.length - 1].sum >= maxLen) {
      // Seed the new chunk with ~slideLen tokens taken from the tail of the
      // finished chunk, so adjacent chunks overlap.
      const chunk: { sum: number; arr: string[] } = { sum: 0, arr: [] };
      for (let j = chunks[chunks.length - 1].arr.length - 1; j >= 0; j--) {
        const chunkText = chunks[chunks.length - 1].arr[j];
        chunk.sum += encode(chunkText).length;
        chunk.arr.unshift(chunkText);
        if (chunk.sum >= slideLen) {
          break;
        }
      }
      chunks.push(chunk);
    }
  }

  return chunks.map((item) => item.arr.join(''));
};