diff --git a/backend/dedup_utils.py b/backend/dedup_utils.py
new file mode 100644
index 0000000..290343c
--- /dev/null
+++ b/backend/dedup_utils.py
@@ -0,0 +1,142 @@
+"""
+Question Deduplication Utilities
+Provides fuzzy matching algorithms to handle AI-generated variations
+"""
+import difflib
+import re
+from typing import List, Dict, Any
+
+
+def normalize_text(text: str) -> str:
+    """
+    Normalize text for comparison by removing extra whitespace and punctuation variations.
+
+    Args:
+        text: Input text to normalize
+
+    Returns:
+        Normalized text
+    """
+    if not text:
+        return ""
+    # Convert to lowercase
+    text = text.lower()
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove common punctuation variations (Chinese/English)
+    text = text.replace('，', ',').replace('。', '.').replace('！', '!').replace('？', '?')
+    text = text.replace('：', ':').replace('；', ';').replace('“', '"').replace('”', '"')
+    # Strip leading/trailing whitespace
+    return text.strip()
+
+
+def calculate_similarity(text1: str, text2: str) -> float:
+    """
+    Calculate similarity between two texts using multiple methods.
+
+    Uses a combination of:
+    1. SequenceMatcher for character-level similarity (70% weight)
+    2. Jaccard similarity for word-level matching (30% weight)
+
+    Args:
+        text1: First text
+        text2: Second text
+
+    Returns:
+        Similarity score between 0.0 and 1.0
+    """
+    if not text1 or not text2:
+        return 0.0
+
+    # Normalize texts
+    norm_text1 = normalize_text(text1)
+    norm_text2 = normalize_text(text2)
+
+    # Exact match after normalization
+    if norm_text1 == norm_text2:
+        return 1.0
+
+    # 1. Character-level similarity using SequenceMatcher (handles typos, minor variations)
+    char_similarity = difflib.SequenceMatcher(None, norm_text1, norm_text2).ratio()
+
+    # 2. Word-level Jaccard similarity (handles word reordering, additions/deletions)
+    words1 = set(norm_text1.split())
+    words2 = set(norm_text2.split())
+
+    if not words1 or not words2:
+        return char_similarity
+
+    intersection = words1.intersection(words2)
+    union = words1.union(words2)
+    jaccard_similarity = len(intersection) / len(union) if union else 0.0
+
+    # Weighted average (character similarity matters more for exact question matching)
+    final_similarity = 0.7 * char_similarity + 0.3 * jaccard_similarity
+
+    return final_similarity
+
+
+def is_duplicate_question(
+    new_question: Dict[str, Any],
+    existing_questions: List[Dict[str, Any]],
+    threshold: float = 0.85
+) -> bool:
+    """
+    Check if a question is duplicate using fuzzy matching.
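For orientation, here is a minimal standalone sketch of the blended score that `calculate_similarity` computes, using the same 0.7/0.3 weighting and 0.85 threshold as this patch; the two sample questions are invented purely for illustration, and the strings are already lowercased the way `normalize_text` would leave them.

```python
import difflib

def blended_similarity(a: str, b: str) -> float:
    # Character-level ratio, mirroring the SequenceMatcher step in calculate_similarity
    char_sim = difflib.SequenceMatcher(None, a, b).ratio()
    # Word-level Jaccard overlap, mirroring the word-set step
    w1, w2 = set(a.split()), set(b.split())
    jaccard = len(w1 & w2) / len(w1 | w2) if (w1 | w2) else 0.0
    # Same weighting as the patch: character similarity dominates
    return 0.7 * char_sim + 0.3 * jaccard

# Hypothetical near-duplicate questions, as an LLM might re-extract them from overlapping chunks
q1 = "what is the time complexity of binary search on a sorted array?"
q2 = "what is the time complexity of binary search in a sorted array?"

score = blended_similarity(q1, q2)
print(f"similarity = {score:.2%}, duplicate at 0.85 threshold = {score >= 0.85}")
```

A single swapped word keeps the character ratio near 1.0 while the word-level Jaccard term takes only a modest hit, so the blend stays comfortably above 0.85 and the pair is treated as one question.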
+ + Handles AI-generated variations where the same question might have: + - Minor wording differences + - Extra/missing punctuation + - Different whitespace + - Slight paraphrasing + + Args: + new_question: Question to check (dict with 'content' key) + existing_questions: List of questions already processed + threshold: Similarity threshold (0.85 = 85% similar is considered duplicate) + + Returns: + True if duplicate found, False otherwise + """ + new_content = new_question.get('content', '') + if not new_content: + return False + + for existing_q in existing_questions: + existing_content = existing_q.get('content', '') + if not existing_content: + continue + + similarity = calculate_similarity(new_content, existing_content) + + if similarity >= threshold: + print(f"[Fuzzy Dedup] Found duplicate (similarity: {similarity:.2%})", flush=True) + print(f" New: {new_content[:60]}...", flush=True) + print(f" Existing: {existing_content[:60]}...", flush=True) + return True + + return False + + +def deduplicate_questions( + questions: List[Dict[str, Any]], + threshold: float = 0.85 +) -> List[Dict[str, Any]]: + """ + Remove duplicate questions from a list using fuzzy matching. + + Args: + questions: List of questions to deduplicate + threshold: Similarity threshold for fuzzy matching + + Returns: + List of unique questions + """ + unique_questions = [] + + for q in questions: + if not is_duplicate_question(q, unique_questions, threshold): + unique_questions.append(q) + + print(f"[Dedup] Reduced from {len(questions)} to {len(unique_questions)} questions") + return unique_questions diff --git a/backend/routers/exam.py b/backend/routers/exam.py index 33ae1ec..47c8c9e 100644 --- a/backend/routers/exam.py +++ b/backend/routers/exam.py @@ -20,6 +20,7 @@ from services.document_parser import document_parser from services.llm_service import LLMService from services.config_service import load_llm_config from utils import is_allowed_file, calculate_content_hash +from dedup_utils import is_duplicate_question router = APIRouter() @@ -154,7 +155,11 @@ async def process_questions_with_dedup( llm_service=None ) -> ParseResult: """ - Process parsed questions with deduplication logic. + Process parsed questions with fuzzy deduplication logic. + + Uses a two-stage deduplication strategy: + 1. Fast exact hash matching (for 100% identical questions) + 2. 
Fuzzy similarity matching (for AI-generated variations) Args: exam_id: Target exam ID @@ -170,17 +175,28 @@ async def process_questions_with_dedup( new_added = 0 ai_answers_generated = 0 - # Get existing content hashes for this exam + # Get existing questions for this exam (content for fuzzy matching) result = await db.execute( - select(Question.content_hash).where(Question.exam_id == exam_id) + select(Question.content, Question.content_hash).where(Question.exam_id == exam_id) ) - existing_hashes = set(row[0] for row in result.all()) + existing_questions_db = result.all() + existing_hashes = set(row[1] for row in existing_questions_db) + existing_questions = [{"content": row[0]} for row in existing_questions_db] + + print(f"[Dedup] Checking against {len(existing_questions)} existing questions in database") # Insert only new questions for q_data in questions_data: content_hash = q_data.get("content_hash") + # Stage 1: Fast exact hash matching if content_hash in existing_hashes: + duplicates_removed += 1 + print(f"[Dedup] Exact hash match - skipping", flush=True) + continue + + # Stage 2: Fuzzy similarity matching (only if hash didn't match) + if is_duplicate_question(q_data, existing_questions, threshold=0.85): duplicates_removed += 1 continue @@ -222,7 +238,8 @@ async def process_questions_with_dedup( content_hash=content_hash ) db.add(new_question) - existing_hashes.add(content_hash) # Add to set to prevent duplicates in current batch + existing_hashes.add(content_hash) # Prevent exact duplicates in current batch + existing_questions.append({"content": q_data["content"]}) # Prevent fuzzy duplicates in current batch new_added += 1 await db.commit() @@ -289,10 +306,39 @@ async def async_parse_and_save( raise Exception("Document appears to be empty or too short") print(f"[Exam {exam_id}] Text content length: {len(text_content)} chars", flush=True) - print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' if len(text_content) > 500 else ''}", flush=True) - print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True) - questions_data = await llm_service.parse_document(text_content) + # Check if document is too long and needs splitting + if len(text_content) > 5000: + print(f"[Exam {exam_id}] Document is long, splitting into chunks...", flush=True) + text_chunks = document_parser.split_text_with_overlap(text_content, chunk_size=3000, overlap=1000) + print(f"[Exam {exam_id}] Split into {len(text_chunks)} chunks", flush=True) + + all_questions = [] + + for chunk_idx, chunk in enumerate(text_chunks): + print(f"[Exam {exam_id}] Processing chunk {chunk_idx + 1}/{len(text_chunks)}...", flush=True) + try: + chunk_questions = await llm_service.parse_document(chunk) + print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} extracted {len(chunk_questions)} questions", flush=True) + + # Fuzzy deduplicate across chunks + for q in chunk_questions: + # Use fuzzy matching to check for duplicates + if not is_duplicate_question(q, all_questions, threshold=0.85): + all_questions.append(q) + else: + print(f"[Exam {exam_id}] Skipped fuzzy duplicate from chunk {chunk_idx + 1}", flush=True) + + except Exception as chunk_error: + print(f"[Exam {exam_id}] Chunk {chunk_idx + 1} failed: {str(chunk_error)}", flush=True) + continue + + questions_data = all_questions + print(f"[Exam {exam_id}] Total questions after fuzzy deduplication: {len(questions_data)}", flush=True) + else: + print(f"[Exam {exam_id}] Document content preview:\n{text_content[:500]}\n{'...' 
if len(text_content) > 500 else ''}", flush=True) + print(f"[Exam {exam_id}] Calling LLM to extract questions...", flush=True) + questions_data = await llm_service.parse_document(text_content) except Exception as parse_error: print(f"[Exam {exam_id}] ⚠️ Parse error details: {type(parse_error).__name__}", flush=True) diff --git a/backend/services/document_parser.py b/backend/services/document_parser.py index 3a7bacc..40d84b5 100644 --- a/backend/services/document_parser.py +++ b/backend/services/document_parser.py @@ -3,7 +3,7 @@ Document Parser Service Supports: TXT, PDF, DOCX, XLSX """ import io -from typing import Optional +from typing import Optional, List import PyPDF2 from docx import Document import openpyxl @@ -40,6 +40,38 @@ class DocumentParser: except Exception as e: raise Exception(f"Failed to parse PDF: {str(e)}") + @staticmethod + def split_text_with_overlap(text: str, chunk_size: int = 3000, overlap: int = 500) -> List[str]: + """ + Split text into overlapping chunks for long documents. + + Args: + text: Full text content + chunk_size: Characters per chunk (default: 3000) + overlap: Overlapping characters between chunks (default: 500) + + Returns: + List of text chunks + """ + if len(text) <= chunk_size: + return [text] + + chunks = [] + start = 0 + + while start < len(text): + end = min(start + chunk_size, len(text)) + chunk = text[start:end] + chunks.append(chunk) + + print(f"[Text Split] Chunk {len(chunks)}: chars {start}-{end}") + + # Move to next chunk with overlap + start = end - overlap if end < len(text) else len(text) + + print(f"[Text Split] Total chunks: {len(chunks)}") + return chunks + @staticmethod async def parse_docx(file_content: bytes) -> str: """Parse DOCX file""" diff --git a/backend/services/llm_service.py b/backend/services/llm_service.py index 3595bd8..7d7a88e 100644 --- a/backend/services/llm_service.py +++ b/backend/services/llm_service.py @@ -121,7 +121,8 @@ class LLMService: **识别规则**: - 文档中可能包含中文或英文题目 - 题目可能有多种格式,请灵活识别 -- 即使格式不标准,也请尽量提取题目内容 +- **重要**:只提取完整的题目,忽略任何不完整的题目(题目被截断、缺少选项、缺少关键信息等) +- 如果题目看起来不完整(比如开头或结尾被切断),直接跳过该题目 - 如果文档只是普通文章而没有题目,请返回空数组 [] **题目类型识别** (严格使用以下4种类型之一): @@ -404,9 +405,58 @@ class LLMService: print(f"[Error] Document parsing failed: {str(e)}") raise Exception(f"Failed to parse document: {str(e)}") + def split_pdf_pages(self, pdf_bytes: bytes, pages_per_chunk: int = 4, overlap: int = 1) -> List[bytes]: + """ + Split PDF into overlapping chunks to handle long documents. 
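Both the character-based splitter above and the page-based PDF splitter that follows advance their window by the same step of size minus overlap. A toy sketch of that boundary arithmetic, using a hypothetical `chunk_ranges` helper that mirrors the loop in `split_text_with_overlap`, with sizes far smaller than the real chunk_size=3000 / overlap=1000 used in the router:

```python
def chunk_ranges(total: int, size: int, overlap: int):
    """Yield (start, end) windows the same way split_text_with_overlap walks the text."""
    start = 0
    while start < total:
        end = min(start + size, total)
        yield start, end
        # Step back by `overlap` so anything cut off at `end` reappears whole in the next window
        start = end - overlap if end < total else total

# Toy numbers: a 7,000-char document, 3,000-char windows, 1,000-char overlap
for i, (s, e) in enumerate(chunk_ranges(7000, 3000, 1000), start=1):
    print(f"chunk {i}: chars {s}-{e}")
# chunk 1: chars 0-3000
# chunk 2: chars 2000-5000
# chunk 3: chars 4000-7000
```

A question shorter than the overlap that gets cut at one boundary appears intact in the next window; the stricter prompt rule then discards the truncated copy, and the fuzzy pass removes the surviving duplicate.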
+ + Args: + pdf_bytes: PDF file content + pages_per_chunk: Number of pages per chunk (default: 4) + overlap: Number of overlapping pages between chunks (default: 1) + + Returns: + List of PDF chunks as bytes + """ + import PyPDF2 + import io + + pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes)) + total_pages = len(pdf_reader.pages) + + # If PDF is small, don't split + if total_pages <= pages_per_chunk: + return [pdf_bytes] + + print(f"[PDF Split] Total pages: {total_pages}, splitting into chunks of {pages_per_chunk} pages with {overlap} page overlap") + + chunks = [] + start = 0 + + while start < total_pages: + end = min(start + pages_per_chunk, total_pages) + + # Create a new PDF with pages [start, end) + pdf_writer = PyPDF2.PdfWriter() + for page_num in range(start, end): + pdf_writer.add_page(pdf_reader.pages[page_num]) + + # Write to bytes + chunk_bytes = io.BytesIO() + pdf_writer.write(chunk_bytes) + chunk_bytes.seek(0) + chunks.append(chunk_bytes.getvalue()) + + print(f"[PDF Split] Chunk {len(chunks)}: pages {start+1}-{end}") + + # Move to next chunk with overlap + start = end - overlap if end < total_pages else total_pages + + return chunks + async def parse_document_with_pdf(self, pdf_bytes: bytes, filename: str) -> List[Dict[str, Any]]: """ Parse PDF document using Gemini's native PDF understanding. + Automatically splits large PDFs into overlapping chunks. Only works with Gemini provider. Args: @@ -419,12 +469,50 @@ class LLMService: if self.provider != "gemini": raise ValueError("PDF parsing is only supported with Gemini provider") + # Split PDF into chunks + pdf_chunks = self.split_pdf_pages(pdf_bytes, pages_per_chunk=4, overlap=1) + + print(f"[Gemini PDF] Processing {len(pdf_chunks)} chunk(s) for {filename}") + + all_questions = [] + # Process each chunk with fuzzy deduplication + for chunk_idx, chunk_bytes in enumerate(pdf_chunks): + print(f"[Gemini PDF] Processing chunk {chunk_idx + 1}/{len(pdf_chunks)}") + + try: + questions = await self._parse_pdf_chunk(chunk_bytes, f"{filename}_chunk_{chunk_idx + 1}") + print(f"[Gemini PDF] Chunk {chunk_idx + 1} extracted {len(questions)} questions") + + # Fuzzy deduplicate across chunks + from dedup_utils import is_duplicate_question + + for q in questions: + if not is_duplicate_question(q, all_questions, threshold=0.85): + all_questions.append(q) + else: + print(f"[PDF Split] Skipped fuzzy duplicate from chunk {chunk_idx + 1}") + + except Exception as e: + print(f"[Gemini PDF] Chunk {chunk_idx + 1} failed: {str(e)}") + # Continue with other chunks + continue + + print(f"[Gemini PDF] Total questions extracted: {len(all_questions)} (after deduplication)") + + return all_questions + + async def _parse_pdf_chunk(self, pdf_bytes: bytes, chunk_name: str) -> List[Dict[str, Any]]: + """ + Parse a single PDF chunk. + Internal method used by parse_document_with_pdf. 
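The per-chunk results are merged with the same accumulate-and-filter pattern in both the text path and the PDF path. A condensed sketch of that merge step, assuming backend/ is on the import path and with made-up chunk output standing in for real LLM responses:

```python
from dedup_utils import is_duplicate_question

# Hypothetical per-chunk results; the page overlap makes chunk 2 re-extract one question
chunk_results = [
    [{"content": "What does the ACID acronym stand for in databases?"}],
    [{"content": "What does the ACID acronym stand for in databases ?"},
     {"content": "Explain the difference between a clustered and a non-clustered index."}],
]

all_questions = []
for questions in chunk_results:
    for q in questions:
        # Keep a question only if nothing at least 85% similar has been collected already
        if not is_duplicate_question(q, all_questions, threshold=0.85):
            all_questions.append(q)

print(len(all_questions))  # 2: the re-extracted ACID question is dropped as a fuzzy duplicate
```

Because each chunk is parsed independently and a failed chunk is skipped with a continue, one bad chunk costs only its own questions rather than the whole document.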
+ """ prompt = """你是一个专业的试题解析专家。请仔细分析这个 PDF 文档,提取其中的所有试题。 **识别规则**: - PDF 中可能包含中文或英文题目、图片、表格、公式 - 题目可能有多种格式,请灵活识别 -- 即使格式不标准,也请尽量提取题目内容 +- **重要**:只提取完整的题目,忽略任何不完整的题目(题目被截断、缺少选项、缺少关键信息等) +- 如果题目看起来不完整(比如开头或结尾被切断),直接跳过该题目 - 题目内容如果包含代码或换行,请将换行符替换为\\n - 图片中的文字也要识别并提取 @@ -492,8 +580,8 @@ class LLMService: - **只返回一个 JSON 数组**,不要包含其他任何内容""" try: - print(f"[Gemini PDF] Processing PDF: {filename}", flush=True) - print(f"[Gemini PDF] File size: {len(pdf_bytes)} bytes", flush=True) + print(f"[Gemini PDF] Processing chunk: {chunk_name}", flush=True) + print(f"[Gemini PDF] Chunk size: {len(pdf_bytes)} bytes", flush=True) # Use Gemini's native PDF processing via REST API import base64 diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index db27afb..61d0e0e 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -16,6 +16,7 @@ import QuizPlayer from './pages/QuizPlayer' import MistakeList from './pages/MistakeList' // Admin Pages +import AdminPanel from './pages/AdminPanel' import AdminSettings from './pages/AdminSettings' function App() { @@ -100,6 +101,15 @@ function App() { /> {/* Admin Only Routes */} + + + + } + /> + - api.get('/admin/config'), + // Config + getConfig: () => api.get('/admin/config'), + updateConfig: (config) => api.put('/admin/config', config), - // Update system config - updateConfig: (config) => - api.put('/admin/config', config) + // Users + getUsers: (skip = 0, limit = 50, search = null) => + api.get('/admin/users', { params: { skip, limit, search } }), + createUser: (username, password, is_admin = false) => + api.post('/admin/users', { username, password, is_admin }), + updateUser: (userId, data) => + api.put(`/admin/users/${userId}`, data), + deleteUser: (userId) => + api.delete(`/admin/users/${userId}`), + + // Statistics + getStatistics: () => api.get('/admin/statistics'), + getHealth: () => api.get('/admin/health'), + + // Export + exportUsers: () => api.get('/admin/export/users', { responseType: 'blob' }), + exportStatistics: () => api.get('/admin/export/statistics', { responseType: 'blob' }) } export default api diff --git a/frontend/src/pages/AdminPanel.jsx b/frontend/src/pages/AdminPanel.jsx new file mode 100644 index 0000000..3564bc7 --- /dev/null +++ b/frontend/src/pages/AdminPanel.jsx @@ -0,0 +1,377 @@ +/** + * Admin Panel - 完整的管理员面板 + */ +import React, { useState, useEffect } from 'react' +import { useNavigate } from 'react-router-dom' +import { adminAPI } from '../api/client' +import { useAuth } from '../context/AuthContext' +import { + Users, BarChart3, Settings, Trash2, Plus, Search, + ArrowLeft, Shield, Activity, Database, Download +} from 'lucide-react' +import toast from 'react-hot-toast' + +export const AdminPanel = () => { + const { user } = useAuth() + const navigate = useNavigate() + const [activeTab, setActiveTab] = useState('stats') + + // 统计数据 + const [stats, setStats] = useState(null) + const [health, setHealth] = useState(null) + + // 用户数据 + const [users, setUsers] = useState([]) + const [usersTotal, setUsersTotal] = useState(0) + const [searchQuery, setSearchQuery] = useState('') + const [showCreateModal, setShowCreateModal] = useState(false) + const [newUser, setNewUser] = useState({ username: '', password: '', is_admin: false }) + + useEffect(() => { + loadStats() + loadHealth() + loadUsers() + }, []) + + const loadStats = async () => { + try { + const res = await adminAPI.getStatistics() + setStats(res.data) + } catch (error) { + console.error('Failed to load statistics:', error) + } + } + + const loadHealth = async () => { + try { 
+ const res = await adminAPI.getHealth() + setHealth(res.data) + } catch (error) { + console.error('Failed to load health:', error) + } + } + + const loadUsers = async () => { + try { + const res = await adminAPI.getUsers(0, 100, searchQuery || null) + setUsers(res.data.users) + setUsersTotal(res.data.total) + } catch (error) { + console.error('Failed to load users:', error) + toast.error('加载用户列表失败') + } + } + + const handleCreateUser = async () => { + if (!newUser.username || !newUser.password) { + toast.error('请填写用户名和密码') + return + } + try { + await adminAPI.createUser(newUser.username, newUser.password, newUser.is_admin) + toast.success('用户创建成功') + setShowCreateModal(false) + setNewUser({ username: '', password: '', is_admin: false }) + loadUsers() + } catch (error) { + toast.error(error.response?.data?.detail || '创建用户失败') + } + } + + const handleDeleteUser = async (userId, username) => { + if (!confirm(`确定删除用户 ${username}?`)) return + try { + await adminAPI.deleteUser(userId) + toast.success('用户已删除') + loadUsers() + } catch (error) { + toast.error(error.response?.data?.detail || '删除失败') + } + } + + const handleExportUsers = async () => { + try { + const response = await adminAPI.exportUsers() + const url = window.URL.createObjectURL(new Blob([response.data])) + const link = document.createElement('a') + link.href = url + link.setAttribute('download', 'users.csv') + document.body.appendChild(link) + link.click() + link.remove() + toast.success('导出成功') + } catch (error) { + toast.error('导出失败') + } + } + + return ( +
+ {/* Header */} +
+
+
+
+ + +
+

管理员面板

+

{user?.username}

+
+
+ +
+
+
+ + {/* Tabs */} +
+
+ + +
+ + {/* Stats Tab */} + {activeTab === 'stats' && stats && ( +
+ {/* Overview Cards */} +
+
+
+
+

用户总数

+

{stats.users?.total || 0}

+
+ +
+
+ +
+
+
+

题库总数

+

{stats.exams?.total || 0}

+
+ +
+
+ +
+
+
+

题目总数

+

{stats.questions?.total || 0}

+
+ +
+
+ +
+
+
+

今日活跃

+

{stats.activity?.today_active_users || 0}

+
+ +
+
+
+ + {/* System Health */} + {health && ( +
+

系统状态

+
+
+ 状态 + + {health.status} + +
+
+ 数据库 + {health.system?.database_url || 'SQLite'} +
+ {health.database?.size_mb && ( +
+ 数据库大小 + {health.database.size_mb} MB +
+ )} +
+
+ )} +
+ )} + + {/* Users Tab */} + {activeTab === 'users' && ( +
+ {/* Actions */} +
+
+ + setSearchQuery(e.target.value)} + onKeyDown={(e) => e.key === 'Enter' && loadUsers()} + className="w-full pl-10 pr-4 py-2 border border-gray-300 rounded-lg focus:ring-2 focus:ring-primary-500" + /> +
+
+ + +
+
+ + {/* Users Table */} +
+ + + + + + + + + + + + + + {users.map((u) => ( + + + + + + + + + + ))} + +
ID | 用户名 | 角色 | 题库数 | 错题数 | 注册时间 | 操作
{u.id}{u.username} + {u.is_admin ? ( + 管理员 + ) : ( + 普通用户 + )} + {u.exam_count || 0}{u.mistake_count || 0} + {new Date(u.created_at).toLocaleDateString()} + + +
+
+
+ )} +
+ + {/* Create User Modal */} + {showCreateModal && ( +
+
+

创建新用户

+
+
+ + setNewUser({ ...newUser, username: e.target.value })} + className="w-full px-4 py-2 border border-gray-300 rounded-lg" + /> +
+
+ + setNewUser({ ...newUser, password: e.target.value })} + className="w-full px-4 py-2 border border-gray-300 rounded-lg" + /> +
+
+ setNewUser({ ...newUser, is_admin: e.target.checked })} + className="rounded" + /> + +
+
+
+ + +
+
+
+ )} +
+ ) +} + +export default AdminPanel diff --git a/frontend/src/pages/Dashboard.jsx b/frontend/src/pages/Dashboard.jsx index 6442fa3..fa8f109 100644 --- a/frontend/src/pages/Dashboard.jsx +++ b/frontend/src/pages/Dashboard.jsx @@ -7,7 +7,7 @@ import { examAPI, mistakeAPI } from '../api/client' import { useAuth } from '../context/AuthContext' import Layout from '../components/Layout' import { - FolderOpen, XCircle, TrendingUp, BookOpen, ArrowRight, Settings + FolderOpen, XCircle, TrendingUp, BookOpen, ArrowRight, Settings, Shield } from 'lucide-react' import { getStatusColor, getStatusText, formatRelativeTime, calculateProgress } from '../utils/helpers' @@ -177,15 +177,24 @@ export const Dashboard = () => {

管理员功能

-

配置系统设置

+

用户管理、系统统计、配置设置

+
+
+ +
-
)} diff --git a/frontend/src/pages/Login.jsx b/frontend/src/pages/Login.jsx index c41d06b..b59c6f7 100644 --- a/frontend/src/pages/Login.jsx +++ b/frontend/src/pages/Login.jsx @@ -110,10 +110,6 @@ export const Login = () => { - {/* Footer */} -
-

默认管理员账号:admin / admin123

-
)