- 新增 src/lib/document-parser.ts:后端文档解析器
  - 支持 Word 文档(.doc/.docx)解析
  - 支持 Excel 文档(.xlsx)转 Markdown 表格
  - PDF 文档直接传递给 Claude API 原生处理
- 新增 src/utils/document-utils.ts:前端文档工具
  - 文档类型检测函数
  - 文件大小验证
  - Base64 编码转换
187 lines
4.5 KiB
TypeScript
187 lines
4.5 KiB
TypeScript
/**
 * Front-end document utilities.
 * Used in the browser to detect and process document types.
 */
/**
 * Per-type document limits.
 *
 * The original note states these values are kept in sync with the backend;
 * the backend side is not visible from this file.
 */
export const DOCUMENT_LIMITS = {
  pdf: {
    maxSize: 32 * 1024 * 1024, // 32 MB (noted by the author as a Claude API limit)
    maxPages: 100, // maximum page count (noted by the author as a Claude API limit)
  },
  word: {
    maxSize: 20 * 1024 * 1024, // 20 MB
  },
  excel: {
    maxSize: 20 * 1024 * 1024, // 20 MB
  },
};
/**
 * Document categories handled by this module.
 * `'unknown'` is the value used for files that match none of the others.
 */
export type DocumentType = 'pdf' | 'word' | 'excel' | 'unknown';
/**
 * PDF document payload (sent to the backend).
 */
export interface PdfDocumentData {
  /** File name. */
  name: string;
  /** File size. */
  size: number;
  /** File content, Base64-encoded. */
  data: string;
  /** Always the PDF MIME type. */
  media_type: 'application/pdf';
}
/**
 * Office document payload (sent to the backend for parsing).
 */
export interface OfficeDocumentData {
  /** File name. */
  name: string;
  /** File size. */
  size: number;
  /** File content, Base64-encoded. */
  data: string;
  /** Which kind of office document this is. */
  type: 'word' | 'excel';
  /** MIME type of the file. */
  mimeType: string;
}
/**
 * Classifies a file as PDF, Word, Excel or unknown.
 *
 * A file matches a category when either its MIME type (`file.type`) or its
 * file-name extension matches. Categories are tested in the order PDF, Word,
 * Excel, and the first match wins.
 *
 * @param file - File to classify; only `type` and `name` are read.
 * @returns The detected category, or `'unknown'` when nothing matches.
 */
export function detectDocumentType(file: File): DocumentType {
  const mimeType = file.type.toLowerCase();

  // Only the text after the last dot is an extension. Splitting on '.' and
  // taking the last piece would treat a dot-less name such as "pdf" or "doc"
  // as if it were an extension and misclassify the file.
  const dotIndex = file.name.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  // PDF
  if (mimeType === 'application/pdf' || extension === 'pdf') {
    return 'pdf';
  }

  // Word documents
  if (
    mimeType === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ||
    mimeType === 'application/msword' ||
    extension === 'docx' ||
    extension === 'doc'
  ) {
    return 'word';
  }

  // Excel documents
  if (
    mimeType === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' ||
    mimeType === 'application/vnd.ms-excel' ||
    extension === 'xlsx' ||
    extension === 'xls'
  ) {
    return 'excel';
  }

  return 'unknown';
}
/**
 * Reports whether the file is a supported document type.
 *
 * @param file - File to check.
 * @returns `true` unless `detectDocumentType` classifies the file as `'unknown'`.
 */
export function isSupportedDocument(file: File): boolean {
  return detectDocumentType(file) !== 'unknown';
}
/**
 * Reports whether the file is a PDF.
 *
 * @param file - File to check.
 * @returns `true` when `detectDocumentType` classifies the file as `'pdf'`.
 */
export function isPdfFile(file: File): boolean {
  return detectDocumentType(file) === 'pdf';
}
/**
 * Reports whether the file is a Word document.
 *
 * @param file - File to check.
 * @returns `true` when `detectDocumentType` classifies the file as `'word'`.
 */
export function isWordFile(file: File): boolean {
  return detectDocumentType(file) === 'word';
}
/**
 * Reports whether the file is an Excel document.
 *
 * @param file - File to check.
 * @returns `true` when `detectDocumentType` classifies the file as `'excel'`.
 */
export function isExcelFile(file: File): boolean {
  return detectDocumentType(file) === 'excel';
}
/**
 * Reports whether the file is an office document (Word or Excel).
 *
 * @param file - File to check.
 * @returns `true` when `detectDocumentType` classifies the file as `'word'` or `'excel'`.
 */
export function isOfficeDocument(file: File): boolean {
  const docType = detectDocumentType(file);
  return docType === 'word' || docType === 'excel';
}
/**
 * Validates a file's size against the limit for its detected document type.
 *
 * Files that `detectDocumentType` classifies as `'unknown'` have no configured
 * limit here and are reported as valid.
 *
 * @param file - File to validate; `size` and `name` are read.
 * @returns `{ valid: true }` when the size is within the limit, otherwise
 *   `{ valid: false, error }` with a user-facing message.
 */
export function validateDocumentSize(file: File): { valid: boolean; error?: string } {
  const docType = detectDocumentType(file);

  if (docType === 'unknown') {
    return { valid: true };
  }

  // Display label used at the start of the error message for each type.
  const labels = { pdf: 'PDF', word: 'Word', excel: 'Excel' };
  const { maxSize } = DOCUMENT_LIMITS[docType];

  // The limit is inclusive: only sizes strictly above it are rejected.
  if (file.size > maxSize) {
    return {
      valid: false,
      error: `${labels[docType]} 文件 "${file.name}" 超过 ${maxSize / 1024 / 1024}MB 限制`,
    };
  }

  return { valid: true };
}
/**
 * Reads a file and returns its content as a Base64 string, without the
 * `data:...;base64,` data-URL prefix.
 *
 * @param file - File to read.
 * @returns A promise resolving to the Base64 payload. Resolves to `''` when
 *   the data URL contains no comma, i.e. there is no payload to extract.
 * @throws Rejects with the reader's error event when reading fails, or with an
 *   `Error` when the reader finishes without producing a string result.
 */
export function fileToBase64(file: File): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const { result } = reader;

      // Settle the promise instead of throwing inside the handler, which
      // would leave the caller waiting forever.
      if (typeof result !== 'string') {
        reject(new Error('FileReader did not produce a data URL string'));
        return;
      }

      // Strip the "data:xxx;base64," prefix and keep only the Base64 data.
      // Without a comma there is no payload; resolve '' rather than undefined.
      const commaIndex = result.indexOf(',');
      resolve(commaIndex === -1 ? '' : result.slice(commaIndex + 1));
    };

    reader.onerror = reject;
    reader.readAsDataURL(file);
  });
}
/**
 * Returns the MIME type of a file.
 *
 * Uses `file.type` when it is non-empty; otherwise infers the type from the
 * file-name extension, falling back to `application/octet-stream`.
 *
 * @param file - File to inspect; only `type` and `name` are read.
 * @returns The MIME type string.
 */
export function getFileMimeType(file: File): string {
  if (file.type) return file.type;

  // Infer from the extension. Only text after the last dot counts, so a
  // dot-less name such as "pdf" is not mistaken for an extension.
  const dotIndex = file.name.lastIndexOf('.');
  const extension = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  // A Map avoids prototype-chain hits: with a plain object, an extension such
  // as "constructor" or "__proto__" would return a non-string value.
  const mimeTypes = new Map<string, string>([
    ['pdf', 'application/pdf'],
    ['doc', 'application/msword'],
    ['docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
    ['xls', 'application/vnd.ms-excel'],
    ['xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'],
  ]);

  return mimeTypes.get(extension) ?? 'application/octet-stream';
}