QQuiz/backend/services/document_parser.py

"""
Document Parser Service
Supports: TXT, PDF, DOCX, XLSX
"""
import io
from typing import Optional
import PyPDF2
from docx import Document
import openpyxl


class DocumentParser:
    """Parse various document formats to extract text content"""

    @staticmethod
    async def parse_txt(file_content: bytes) -> str:
        """Parse TXT file"""
        try:
            return file_content.decode('utf-8')
        except UnicodeDecodeError:
            try:
                return file_content.decode('gbk')
            except:
                return file_content.decode('utf-8', errors='ignore')

    @staticmethod
    async def parse_pdf(file_content: bytes) -> str:
        """Parse PDF file"""
        try:
            pdf_file = io.BytesIO(file_content)
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            text_content = []
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    text_content.append(text)

            return '\n\n'.join(text_content)
        except Exception as e:
            raise Exception(f"Failed to parse PDF: {str(e)}")

    @staticmethod
    async def parse_docx(file_content: bytes) -> str:
        """Parse DOCX file"""
        try:
            docx_file = io.BytesIO(file_content)
            doc = Document(docx_file)

            text_content = []
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    text_content.append(paragraph.text)

            # Also extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                    if row_text.strip():
                        text_content.append(row_text)

            return '\n\n'.join(text_content)
        except Exception as e:
            raise Exception(f"Failed to parse DOCX: {str(e)}")

    @staticmethod
    async def parse_xlsx(file_content: bytes) -> str:
        """Parse XLSX file"""
        try:
            xlsx_file = io.BytesIO(file_content)
            workbook = openpyxl.load_workbook(xlsx_file, data_only=True)

            text_content = []
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                text_content.append(f"=== Sheet: {sheet_name} ===")

                for row in sheet.iter_rows(values_only=True):
                    row_text = ' | '.join(str(cell) if cell is not None else '' for cell in row)
                    if row_text.strip(' |'):
                        text_content.append(row_text)

            return '\n\n'.join(text_content)
        except Exception as e:
            raise Exception(f"Failed to parse XLSX: {str(e)}")

    @staticmethod
    async def parse_file(file_content: bytes, filename: str) -> str:
        """
        Parse file based on extension.

        Args:
            file_content: File content as bytes
            filename: Original filename

        Returns:
            Extracted text content

        Raises:
            Exception: If file format is unsupported or parsing fails
        """
        extension = filename.rsplit('.', 1)[-1].lower() if '.' in filename else ''

        parsers = {
            'txt': DocumentParser.parse_txt,
            'pdf': DocumentParser.parse_pdf,
            'docx': DocumentParser.parse_docx,
            'doc': DocumentParser.parse_docx,  # Try to parse DOC as DOCX
            'xlsx': DocumentParser.parse_xlsx,
            'xls': DocumentParser.parse_xlsx,  # Try to parse XLS as XLSX
        }

        parser = parsers.get(extension)
        if not parser:
            raise Exception(f"Unsupported file format: {extension}")

        return await parser(file_content)


# Singleton instance
document_parser = DocumentParser()