feat(文档解析): 添加PDF/Word/Excel文档解析工具

- 新增 src/lib/document-parser.ts: 后端文档解析器
  - 支持 Word 文档(.doc/.docx)解析
  - 支持 Excel 文档(.xls/.xlsx)转Markdown表格
  - PDF文档直接传递给Claude API原生处理

- 新增 src/utils/document-utils.ts: 前端文档工具
  - 文档类型检测函数
  - 文件大小验证
  - Base64编码转换
This commit is contained in:
gaoziman 2025-12-22 23:22:13 +08:00
parent be03aebb09
commit fa260137ac
2 changed files with 522 additions and 0 deletions

336
src/lib/document-parser.ts Normal file
View File

@ -0,0 +1,336 @@
/**
 * Backend document parsing utilities.
 * Covers PDF, Word (.doc/.docx) and Excel (.xlsx) documents.
 */
import mammoth from 'mammoth';
import * as XLSX from 'xlsx';
import WordExtractor from 'word-extractor';
/**
 * Per-format limits applied to documents.
 * `maxSize` values are byte counts (they are compared against `file.size`).
 */
export const DOCUMENT_LIMITS = {
  pdf: {
    // 32 MB (Claude API limit)
    maxSize: 32 * 1024 ** 2,
    // Claude API maximum page count
    maxPages: 100,
  },
  word: {
    // 20 MB
    maxSize: 20 * 1024 ** 2,
  },
  excel: {
    // 20 MB
    maxSize: 20 * 1024 ** 2,
    // Maximum number of rows read per worksheet
    maxRows: 10000,
    // Maximum number of worksheets read per workbook
    maxSheets: 10,
  },
};
// Supported document categories; 'unknown' is what detectDocumentType returns
// when neither the MIME type nor the extension is recognised.
export type DocumentType = 'pdf' | 'word' | 'excel' | 'unknown';
/**
 * A document file together with its Base64-encoded content.
 */
export interface DocumentFile {
  name: string;
  size: number;
  type: DocumentType;
  mimeType: string;
  data: string; // Base64-encoded file content
}
/**
 * PDF document payload.
 * NOTE(review): the original comment mentions the Claude API; presumably this
 * is the shape forwarded to it — confirm against the caller.
 */
export interface PdfDocument {
  name: string;
  size: number;
  data: string; // Base64-encoded
  media_type: 'application/pdf';
}
/**
 * Result of parsing a document into text.
 */
export interface ParsedDocument {
  name: string;
  size: number;
  type: DocumentType;
  content: string; // Extracted text content
  metadata?: {
    sheets?: string[]; // Excel worksheet names
    pageCount?: number; // Page count
  };
}
/**
 * Classifies a file as PDF, Word or Excel from its MIME type or its
 * file-name extension.
 *
 * @param file - Object carrying the file name and its reported MIME type.
 * @returns The detected category, or `'unknown'` when neither the MIME type
 *   nor the extension matches a supported format.
 */
export function detectDocumentType(file: { name: string; type: string }): DocumentType {
  const mimeType = file.type.toLowerCase();
  // Only the text after the last dot is an extension. A dot-less name such as
  // "pdf" has no extension and must not be mistaken for a PDF.
  const dotIndex = file.name.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  // PDF
  if (mimeType === 'application/pdf' || extension === 'pdf') {
    return 'pdf';
  }

  // Word documents
  if (
    mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
    mimeType === 'application/msword' ||
    extension === 'docx' ||
    extension === 'doc'
  ) {
    return 'word';
  }

  // Excel documents
  if (
    mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
    mimeType === 'application/vnd.ms-excel' ||
    extension === 'xlsx' ||
    extension === 'xls'
  ) {
    return 'excel';
  }

  return 'unknown';
}
/**
 * Tells whether the file is one of the supported document formats.
 *
 * @returns `true` unless the detected type is `'unknown'`.
 */
export function isSupportedDocument(file: { name: string; type: string }): boolean {
  return detectDocumentType(file) !== 'unknown';
}
/**
 * Tells whether the file is detected as a PDF.
 */
export function isPdfFile(file: { name: string; type: string }): boolean {
  const detected = detectDocumentType(file);
  return detected === 'pdf';
}
/**
 * Tells whether the file is detected as a Word document.
 */
export function isWordFile(file: { name: string; type: string }): boolean {
  const detected = detectDocumentType(file);
  return detected === 'word';
}
/**
 * Tells whether the file is detected as an Excel document.
 */
export function isExcelFile(file: { name: string; type: string }): boolean {
  const detected = detectDocumentType(file);
  return detected === 'excel';
}
/**
 * Checks the file size against the limit configured for its document type.
 *
 * @param file - Name, MIME type and size of the file to check.
 * @returns `{ valid: true }` when the size is within the limit (or the type
 *   has no limit, i.e. is `'unknown'`); otherwise `{ valid: false, error }`
 *   with a message naming the file and the limit in MB.
 */
export function validateDocumentSize(
  file: { name: string; type: string; size: number }
): { valid: boolean; error?: string } {
  const kind = detectDocumentType(file);
  const toMegabytes = (bytes: number): number => bytes / 1024 / 1024;

  if (kind === 'pdf' && file.size > DOCUMENT_LIMITS.pdf.maxSize) {
    const limitMb = toMegabytes(DOCUMENT_LIMITS.pdf.maxSize);
    return { valid: false, error: `PDF 文件 "${file.name}" 超过 ${limitMb}MB 限制` };
  }

  if (kind === 'word' && file.size > DOCUMENT_LIMITS.word.maxSize) {
    const limitMb = toMegabytes(DOCUMENT_LIMITS.word.maxSize);
    return { valid: false, error: `Word 文件 "${file.name}" 超过 ${limitMb}MB 限制` };
  }

  if (kind === 'excel' && file.size > DOCUMENT_LIMITS.excel.maxSize) {
    const limitMb = toMegabytes(DOCUMENT_LIMITS.excel.maxSize);
    return { valid: false, error: `Excel 文件 "${file.name}" 超过 ${limitMb}MB 限制` };
  }

  return { valid: true };
}
/**
 * Decodes a Base64 string into a Buffer.
 *
 * @param base64 - Base64-encoded data.
 * @returns A Buffer holding the decoded bytes.
 */
export function base64ToBuffer(base64: string): Buffer {
  const decoded = Buffer.from(base64, 'base64');
  return decoded;
}
/** Leading bytes of an OLE2 compound file, the container of legacy .doc files. */
const WORD_OLE_SIGNATURE: readonly number[] = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];

/** Leading bytes of a ZIP archive, the container of .docx files. */
const WORD_ZIP_SIGNATURE: readonly number[] = [0x50, 0x4b, 0x03, 0x04];

/** Tells whether `buffer` begins with exactly the bytes in `signature`. */
function wordBufferStartsWith(buffer: Buffer, signature: readonly number[]): boolean {
  return buffer.length >= signature.length && signature.every((byte, i) => buffer[i] === byte);
}

/**
 * Decides whether the data is a legacy binary .doc file.
 * The container signature wins; the file-name extension is only a fallback
 * for data that carries neither known signature.
 */
function isLegacyWordBinary(buffer: Buffer, fileName: string): boolean {
  if (wordBufferStartsWith(buffer, WORD_OLE_SIGNATURE)) return true;
  if (wordBufferStartsWith(buffer, WORD_ZIP_SIGNATURE)) return false;
  const dotIndex = fileName.lastIndexOf('.');
  return dotIndex !== -1 && fileName.slice(dotIndex + 1).toLowerCase() === 'doc';
}

/**
 * Extracts the text of a Word document (.doc or .docx).
 * - Legacy binary .doc: handled by word-extractor (body plus footnotes/endnotes).
 * - .docx: handled by mammoth's raw-text extraction.
 *
 * The parser is picked from the file's leading bytes rather than from the
 * name alone, so a legacy file that reaches this function without a ".doc"
 * extension (e.g. accepted via its MIME type) is not handed to mammoth.
 *
 * @param base64Data - Base64-encoded file content.
 * @param fileName - Original file name; used as a fallback format hint and in messages.
 * @returns The parsed document; `size` is the decoded byte length.
 * @throws Error wrapping any decoding/parsing failure, with the file name in the message.
 */
export async function parseWordDocument(base64Data: string, fileName: string): Promise<ParsedDocument> {
  try {
    const buffer = base64ToBuffer(base64Data);
    let content = '';

    if (isLegacyWordBinary(buffer, fileName)) {
      // word-extractor handles the legacy Word 97-2003 format
      console.log('[parseWordDocument] Using word-extractor for .doc file:', fileName);
      const extractor = new WordExtractor();
      const doc = await extractor.extract(buffer);

      // Main body text
      content = doc.getBody();

      // Append footnotes and endnotes when they contain any text
      const footnotes = doc.getFootnotes();
      const endnotes = doc.getEndnotes();
      if (footnotes && footnotes.trim()) {
        content += '\n\n--- 脚注 ---\n' + footnotes;
      }
      if (endnotes && endnotes.trim()) {
        content += '\n\n--- 尾注 ---\n' + endnotes;
      }
      console.log('[parseWordDocument] Successfully extracted .doc content, length:', content.length);
    } else {
      // mammoth handles the Office Open XML (.docx) format
      console.log('[parseWordDocument] Using mammoth for .docx file:', fileName);
      const result = await mammoth.extractRawText({ buffer });
      content = result.value;
      console.log('[parseWordDocument] Successfully extracted .docx content, length:', content.length);
    }

    return {
      name: fileName,
      size: buffer.length,
      type: 'word',
      content,
    };
  } catch (error) {
    console.error('[parseWordDocument] Error:', error);
    const errorMsg = error instanceof Error ? error.message : '未知错误';
    throw new Error(`解析 Word 文档 "${fileName}" 失败: ${errorMsg}`);
  }
}
/** Renders one cell value as text that is safe inside a Markdown table cell. */
function toMarkdownTableCell(value: unknown): string {
  // `??` rather than `||`, so that 0 and false are rendered instead of blanked.
  return String(value ?? '')
    .replace(/\|/g, '\\|')
    // Any line break inside a cell would end the table row.
    .replace(/\r\n|\r|\n/g, ' ');
}

/** Joins a row's cells into one newline-terminated Markdown table line. */
function toMarkdownTableRow(cells: readonly unknown[]): string {
  return '| ' + cells.map(toMarkdownTableCell).join(' | ') + ' |\n';
}

/**
 * Converts an Excel workbook into Markdown tables, one per worksheet.
 *
 * At most `DOCUMENT_LIMITS.excel.maxSheets` worksheets and
 * `DOCUMENT_LIMITS.excel.maxRows` rows per worksheet are read; a note is
 * appended wherever content was cut off. The first row of each worksheet is
 * used as the table header. Worksheets that yield no rows are skipped in the
 * content but still listed in `metadata.sheets`.
 *
 * @param base64Data - Base64-encoded workbook content.
 * @param fileName - Original file name, echoed in the result and in error messages.
 * @returns The parsed document; `size` is the decoded byte length.
 * @throws Error wrapping any decoding/parsing failure.
 */
export async function parseExcelDocument(base64Data: string, fileName: string): Promise<ParsedDocument> {
  try {
    const buffer = base64ToBuffer(base64Data);
    const workbook = XLSX.read(buffer, { type: 'buffer' });
    const { maxRows, maxSheets } = DOCUMENT_LIMITS.excel;

    const sheets: string[] = [];
    const contentParts: string[] = [];

    for (const sheetName of workbook.SheetNames.slice(0, maxSheets)) {
      sheets.push(sheetName);
      const worksheet = workbook.Sheets[sheetName];

      // Used range of the worksheet, clipped to the row limit.
      const range = XLSX.utils.decode_range(worksheet['!ref'] || 'A1');
      const rowCount = range.e.r - range.s.r + 1;
      const limitedRange = {
        ...range,
        e: { ...range.e, r: Math.min(range.e.r, range.s.r + maxRows - 1) },
      };

      // header: 1 yields an array of cell arrays.
      const rows = XLSX.utils.sheet_to_json(worksheet, {
        range: limitedRange,
        header: 1,
        defval: '',
      }) as unknown[][];

      const [headerRow, ...dataRows] = rows;
      if (headerRow === undefined) continue;

      const lines: string[] = [
        `\n### 工作表: ${sheetName}\n\n`,
        toMarkdownTableRow(headerRow),
        '| ' + headerRow.map(() => '---').join(' | ') + ' |\n',
        ...dataRows.map(toMarkdownTableRow),
      ];
      if (rowCount > maxRows) {
        lines.push(`\n*(已截断,原表格共 ${rowCount} 行,仅显示前 ${maxRows} 行)*\n`);
      }
      contentParts.push(lines.join(''));
    }

    // Note any worksheets that were left out because of the sheet limit.
    if (workbook.SheetNames.length > maxSheets) {
      contentParts.push(`\n*(共 ${workbook.SheetNames.length} 个工作表,仅显示前 ${maxSheets} 个)*\n`);
    }

    return {
      name: fileName,
      size: buffer.length,
      type: 'excel',
      content: contentParts.join('\n'),
      metadata: {
        sheets,
      },
    };
  } catch (error) {
    console.error('[parseExcelDocument] Error:', error);
    const errorMsg = error instanceof Error ? error.message : '未知错误';
    // Include the file name, matching the Word parser's error format.
    throw new Error(`解析 Excel 文档 "${fileName}" 失败: ${errorMsg}`);
  }
}
/**
 * Dispatches a document to the parser matching its detected type.
 *
 * @param base64Data - Base64-encoded file content.
 * @param fileName - Original file name.
 * @param mimeType - Reported MIME type of the file.
 * @returns The parsed document for Word and Excel files; `null` for PDFs and
 *   for unrecognised types.
 */
export async function parseDocument(
  base64Data: string,
  fileName: string,
  mimeType: string
): Promise<ParsedDocument | null> {
  const kind = detectDocumentType({ name: fileName, type: mimeType });

  if (kind === 'word') {
    return parseWordDocument(base64Data, fileName);
  }
  if (kind === 'excel') {
    return parseExcelDocument(base64Data, fileName);
  }

  // PDFs are not parsed here: per the original author's note they are passed
  // to the Claude API as a native document. Unknown types also yield null.
  return null;
}

186
src/utils/document-utils.ts Normal file
View File

@ -0,0 +1,186 @@
/**
 * Frontend document utilities:
 * document type detection, file size validation and Base64 conversion.
 */
/**
 * Per-format document limits (kept in sync with the backend).
 * `maxSize` values are byte counts (they are compared against `file.size`).
 */
export const DOCUMENT_LIMITS = {
  pdf: {
    // 32 MB (Claude API limit)
    maxSize: 32 * 1024 ** 2,
    // Claude API maximum page count
    maxPages: 100,
  },
  word: {
    // 20 MB
    maxSize: 20 * 1024 ** 2,
  },
  excel: {
    // 20 MB
    maxSize: 20 * 1024 ** 2,
  },
};
// Supported document categories; 'unknown' is what detectDocumentType returns
// when neither the MIME type nor the extension is recognised.
export type DocumentType = 'pdf' | 'word' | 'excel' | 'unknown';
/**
 * PDF document data.
 */
export interface PdfDocumentData {
  name: string;
  size: number;
  data: string; // Base64-encoded
  media_type: 'application/pdf';
}
/**
 * Office (Word or Excel) document data.
 */
export interface OfficeDocumentData {
  name: string;
  size: number;
  data: string; // Base64-encoded
  type: 'word' | 'excel';
  mimeType: string;
}
/**
 * Classifies a file as PDF, Word or Excel from its MIME type or its
 * file-name extension.
 *
 * @param file - The file to classify.
 * @returns The detected category, or `'unknown'` when neither the MIME type
 *   nor the extension matches a supported format.
 */
export function detectDocumentType(file: File): DocumentType {
  const mimeType = file.type.toLowerCase();
  // Only the text after the last dot is an extension. A dot-less name such as
  // "pdf" has no extension and must not be mistaken for a PDF.
  const dotIndex = file.name.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  // PDF
  if (mimeType === 'application/pdf' || extension === 'pdf') {
    return 'pdf';
  }

  // Word documents
  if (
    mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
    mimeType === 'application/msword' ||
    extension === 'docx' ||
    extension === 'doc'
  ) {
    return 'word';
  }

  // Excel documents
  if (
    mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
    mimeType === 'application/vnd.ms-excel' ||
    extension === 'xlsx' ||
    extension === 'xls'
  ) {
    return 'excel';
  }

  return 'unknown';
}
/**
 * Tells whether the file is one of the supported document formats.
 *
 * @returns `true` unless the detected type is `'unknown'`.
 */
export function isSupportedDocument(file: File): boolean {
  return detectDocumentType(file) !== 'unknown';
}
/**
 * Tells whether the file is detected as a PDF.
 */
export function isPdfFile(file: File): boolean {
  const detected = detectDocumentType(file);
  return detected === 'pdf';
}
/**
 * Tells whether the file is detected as a Word document.
 */
export function isWordFile(file: File): boolean {
  const detected = detectDocumentType(file);
  return detected === 'word';
}
/**
 * Tells whether the file is detected as an Excel document.
 */
export function isExcelFile(file: File): boolean {
  const detected = detectDocumentType(file);
  return detected === 'excel';
}
/**
 * Tells whether the file is an Office document, i.e. Word or Excel.
 */
export function isOfficeDocument(file: File): boolean {
  switch (detectDocumentType(file)) {
    case 'word':
    case 'excel':
      return true;
    default:
      return false;
  }
}
/**
 * Checks the file size against the limit configured for its document type.
 *
 * @param file - The file to check.
 * @returns `{ valid: true }` when the size is within the limit (or the type
 *   has no limit, i.e. is `'unknown'`); otherwise `{ valid: false, error }`
 *   with a message naming the file and the limit in MB.
 */
export function validateDocumentSize(file: File): { valid: boolean; error?: string } {
  const kind = detectDocumentType(file);
  const toMegabytes = (bytes: number): number => bytes / 1024 / 1024;

  if (kind === 'pdf' && file.size > DOCUMENT_LIMITS.pdf.maxSize) {
    const limitMb = toMegabytes(DOCUMENT_LIMITS.pdf.maxSize);
    return { valid: false, error: `PDF 文件 "${file.name}" 超过 ${limitMb}MB 限制` };
  }

  if (kind === 'word' && file.size > DOCUMENT_LIMITS.word.maxSize) {
    const limitMb = toMegabytes(DOCUMENT_LIMITS.word.maxSize);
    return { valid: false, error: `Word 文件 "${file.name}" 超过 ${limitMb}MB 限制` };
  }

  if (kind === 'excel' && file.size > DOCUMENT_LIMITS.excel.maxSize) {
    const limitMb = toMegabytes(DOCUMENT_LIMITS.excel.maxSize);
    return { valid: false, error: `Excel 文件 "${file.name}" 超过 ${limitMb}MB 限制` };
  }

  return { valid: true };
}
/**
 * Reads a file and returns its content as a bare Base64 string
 * (the `data:...;base64,` prefix of the data URL is removed).
 *
 * @param file - The file to read.
 * @returns A promise resolving to the Base64 payload; an empty string when
 *   the data URL carries no payload (e.g. an empty file).
 * @throws Rejects with the reader's error (or a generic Error) when the read fails.
 */
export async function fileToBase64(file: File): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const result = reader.result;
      if (typeof result !== 'string') {
        reject(new Error(`读取文件 "${file.name}" 失败`));
        return;
      }
      // Keep only what follows the first comma. A data URL without a comma
      // has no payload, so resolve '' instead of undefined.
      const commaIndex = result.indexOf(',');
      resolve(commaIndex === -1 ? '' : result.slice(commaIndex + 1));
    };

    // Reject with a real error object rather than the raw ProgressEvent.
    reader.onerror = () => {
      reject(reader.error ?? new Error(`读取文件 "${file.name}" 失败`));
    };

    reader.readAsDataURL(file);
  });
}
/**
 * Returns the file's MIME type, inferring it from the file-name extension
 * when the file does not report one.
 *
 * @param file - The file to inspect.
 * @returns `file.type` when non-empty; otherwise the MIME type mapped from the
 *   extension, or `'application/octet-stream'` when the extension is unknown
 *   or the name has no extension.
 */
export function getFileMimeType(file: File): string {
  if (file.type) return file.type;

  // Only the text after the last dot is an extension; a dot-less name has none.
  const dotIndex = file.name.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  // A Map, not a plain object: a plain-object lookup would also match
  // inherited keys such as "constructor" and return a non-string value.
  const mimeTypes = new Map<string, string>([
    ['pdf', 'application/pdf'],
    ['doc', 'application/msword'],
    ['docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
    ['xls', 'application/vnd.ms-excel'],
    ['xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'],
  ]);

  return mimeTypes.get(extension) ?? 'application/octet-stream';
}