fix: strip null bytes from PDF and file content to prevent PostgreSQL UTF8 encoding errors

2025-06-13 11:11:04 +08:00 · 2025-06-13 11:11:04 +08:00 · f6728f1b82
commit f6728f1b82
parent 540226a792
4 changed files with 39 additions and 22 deletions
--- a/src/components/chat-view/ChatView.tsx
+++ b/src/components/chat-view/ChatView.tsx
@@ -23,16 +23,16 @@ import { useLLM } from '../../contexts/LLMContext'
 import { useMcpHub } from '../../contexts/McpHubContext'
 import { useRAG } from '../../contexts/RAGContext'
 import { useSettings } from '../../contexts/SettingsContext'
+import { matchSearchUsingCorePlugin } from '../../core/file-search/match/coreplugin-match'
+import { matchSearchUsingOmnisearch } from '../../core/file-search/match/omnisearch-match'
+import { regexSearchUsingCorePlugin } from '../../core/file-search/regex/coreplugin-regex'
+import { regexSearchUsingRipgrep } from '../../core/file-search/regex/ripgrep-regex'
 import {
 	LLMAPIKeyInvalidException,
 	LLMAPIKeyNotSetException,
 	LLMBaseUrlNotSetException,
 	LLMModelNotSetException,
 } from '../../core/llm/exception'
-import { matchSearchUsingCorePlugin } from '../../core/file-search/match/coreplugin-match'
-import { matchSearchUsingOmnisearch } from '../../core/file-search/match/omnisearch-match'
-import { regexSearchUsingRipgrep } from '../../core/file-search/regex/ripgrep-regex'
-import { regexSearchUsingCorePlugin } from '../../core/file-search/regex/coreplugin-regex'
 import { useChatHistory } from '../../hooks/use-chat-history'
 import { useCustomModes } from '../../hooks/use-custom-mode'
 import { t } from '../../lang/helpers'
@@ -50,26 +50,26 @@ import {
 	getMentionableKey,
 	serializeMentionable,
 } from '../../utils/mentionable'
-import { readTFileContent } from '../../utils/obsidian'
+import { readTFileContent, readTFileContentPdf } from '../../utils/obsidian'
 import { openSettingsModalWithError } from '../../utils/open-settings-modal'
 import { PromptGenerator, addLineNumbers } from '../../utils/prompt-generator'
 // Removed empty line above, added one below for group separation
 import { fetchUrlsContent, onEnt, webSearch } from '../../utils/web-search'

-import { ModeSelect } from './chat-input/ModeSelect' // Start of new group
+import { ModeSelect } from './chat-input/ModeSelect'; // Start of new group
 import PromptInputWithActions, { ChatUserInputRef } from './chat-input/PromptInputWithActions'
 import { editorStateToPlainText } from './chat-input/utils/editor-state-to-plain-text'
 import { ChatHistory } from './ChatHistoryView'
 import CommandsView from './CommandsView'
 import CustomModeView from './CustomModeView'
+import FileReadResults from './FileReadResults'
 import HelloInfo from './HelloInfo'
-import McpHubView from './McpHubView' // Moved after MarkdownReasoningBlock
+import MarkdownReasoningBlock from './Markdown/MarkdownReasoningBlock'
+import McpHubView from './McpHubView'; // Moved after MarkdownReasoningBlock
 import QueryProgress, { QueryProgressState } from './QueryProgress'
 import ReactMarkdown from './ReactMarkdown'
 import SimilaritySearchResults from './SimilaritySearchResults'
-import FileReadResults from './FileReadResults'
 import WebsiteReadResults from './WebsiteReadResults'
-import MarkdownReasoningBlock from './Markdown/MarkdownReasoningBlock'

 // Add an empty line here
 const getNewInputMessage = (app: App, defaultMention: string): ChatUserMessage => {
@@ -581,7 +581,7 @@ const Chat = forwardRef<ChatRef, ChatProps>((props, ref) => {
 					if (!opFile) {
 						throw new Error(`File not found: ${toolArgs.filepath}`)
 					}
-					const fileContent = await readTFileContent(opFile, app.vault, app)
+					const fileContent = await readTFileContentPdf(opFile, app.vault, app)
 					const formattedContent = `[read_file for '${toolArgs.filepath}'] Result:\n${addLineNumbers(fileContent)}\n`;
 					return {
 						type: 'read_file',
--- a/src/core/file-search/match/coreplugin-match.ts
+++ b/src/core/file-search/match/coreplugin-match.ts
@@ -88,4 +88,4 @@ export async function matchSearchUsingCorePlugin(
 		console.error("Error during core plugin processing:", error);
 		return "An error occurred during the search.";
 	}
-}
+}
--- a/src/utils/obsidian.ts
+++ b/src/utils/obsidian.ts
@@ -8,7 +8,7 @@ export async function parsePdfContent(file: TFile, app: App): Promise<string> {
 	try {
 		// 使用 Obsidian 内置的 PDF.js
 		const pdfjsLib = await loadPdfJs()
-		
+
 		// Read PDF file as binary buffer
 		const pdfBuffer = await app.vault.readBinary(file)

@@ -26,7 +26,9 @@ export async function parsePdfContent(file: TFile, app: App): Promise<string> {
 			fullText += pageText + '\n\n'
 		}

-		return fullText || '(Empty PDF content)'
+		// 清理null字节，防止PostgreSQL UTF8编码错误
+		const cleanText = (fullText || '(Empty PDF content)').replace(/\0/g, '')
+		return cleanText
 	} catch (error: any) {
 		console.error('Error parsing PDF:', error)
 		return `(Error reading PDF file: ${error?.message || 'Unknown error'})`
@@ -36,27 +38,42 @@ export async function parsePdfContent(file: TFile, app: App): Promise<string> {
 export async function readTFileContent(
 	file: TFile,
 	vault: Vault,
+): Promise<string> {
+	if (file.extension != 'md') {
+		return "(Binary file, unable to display content)"
+	}
+	const content = await vault.cachedRead(file)
+	// 清理null字节，防止PostgreSQL UTF8编码错误
+	return content.replace(/\0/g, '')
+}
+
+export async function readTFileContentPdf(
+	file: TFile,
+	vault: Vault,
 	app?: App,
 ): Promise<string> {
 	if (file.extension === 'pdf') {
 		if (app) {
-			return await parsePdfContent(file, app)
+			const content = await parsePdfContent(file, app)
+			// 清理null字节，防止PostgreSQL UTF8编码错误
+			return content.replace(/\0/g, '')
 		}
 		return "(PDF file, app context required for processing)"
 	}
 	if (file.extension != 'md') {
 		return "(Binary file, unable to display content)"
 	}
-	return await vault.cachedRead(file)
+	const content = await vault.cachedRead(file)
+	// 清理null字节，防止PostgreSQL UTF8编码错误
+	return content.replace(/\0/g, '')
 }

 export async function readMultipleTFiles(
 	files: TFile[],
-	vault: Vault,
-	app?: App,
+	vault: Vault
 ): Promise<string[]> {
 	// Read files in parallel
-	const readPromises = files.map((file) => readTFileContent(file, vault, app))
+	const readPromises = files.map((file) => readTFileContent(file, vault))
 	return await Promise.all(readPromises)
 }

--- a/src/utils/prompt-generator.ts
+++ b/src/utils/prompt-generator.ts
@@ -87,7 +87,7 @@ async function getFileOrFolderContent(
 			if (path.extension != 'md') {
 				return "(Binary file, unable to display content)"
 			}
-			return addLineNumbers(await readTFileContent(path, vault, app))
+			return addLineNumbers(await readTFileContent(path, vault))
 		} else if (path instanceof TFolder) {
 			const entries = path.children
 			let folderContent = ""
@@ -111,7 +111,7 @@ async function getFileOrFolderContent(
 								if (entry.extension != 'md') {
 									return undefined
 								}
-								const content = addLineNumbers(await readTFileContent(entry, vault, app))
+								const content = addLineNumbers(await readTFileContent(entry, vault))
 								return `<file_content path="${entry.path}">\n${content}\n</file_content>`
 							} catch (error) {
 								return undefined
@@ -883,7 +883,7 @@ ${customInstruction}
 	private async getCurrentFileMessage(
 		currentFile: TFile,
 	): Promise<RequestMessage> {
-		const fileContent = await readTFileContent(currentFile, this.app.vault, this.app)
+		const fileContent = await readTFileContent(currentFile, this.app.vault)
 		return {
 			role: 'user',
 			content: `# Inputs
@@ -905,7 +905,7 @@ ${fileContent}
 			return null;
 		}

-		const fileContent = await readTFileContent(currentFile, this.app.vault, this.app);
+		const fileContent = await readTFileContent(currentFile, this.app.vault);
 		const lines = fileContent.split('\n');

 		// 计算上下文范围，并处理边界情况