AndyC committed on
Commit
4644da6
·
1 Parent(s): 6fc4e80

added additional tests for pdf processing

Browse files
Files changed (1) hide show
  1. tests/test_media.py +248 -2
tests/test_media.py CHANGED
@@ -5,7 +5,7 @@ from PIL import Image
5
  from pathlib import Path
6
  import tempfile
7
 
8
- from app import get_frames, process_video, process_user_input, process_history
9
 
10
  # Get the project root directory
11
  ROOT_DIR = Path(__file__).parent.parent
@@ -296,4 +296,250 @@ def test_process_history_file_handling():
296
 
297
  finally:
298
  if os.path.exists(image_path):
299
- os.unlink(image_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from pathlib import Path
6
  import tempfile
7
 
8
+ from app import get_frames, process_video, process_user_input, process_history, extract_pdf_text
9
 
10
  # Get the project root directory
11
  ROOT_DIR = Path(__file__).parent.parent
 
296
 
297
  finally:
298
  if os.path.exists(image_path):
299
+ os.unlink(image_path)
300
+
301
+
302
def test_extract_pdf_text_nonexistent_file():
    """Verify that a missing path makes ``extract_pdf_text`` raise ValueError.

    The raised message is expected to match the pattern "File not found".
    """
    missing_path = "nonexistent_file.pdf"
    with pytest.raises(ValueError, match="File not found"):
        extract_pdf_text(missing_path)
306
+
307
+
308
def test_extract_pdf_text_with_mock_pdf():
    """Test PDF text extraction with a simple single-page PDF file.

    Builds a one-page PDF holding three lines of text with PyMuPDF, runs
    ``extract_pdf_text`` on it, and checks that the returned string contains
    the inserted text together with a ``"Page 1:"`` label.
    """
    import fitz  # PyMuPDF

    # Only the path is needed here: delete=False keeps the file on disk after
    # the handle closes, and the PDF is written to that path by doc.save().
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        pdf_path = temp_pdf.name

    try:
        doc = fitz.open()  # Create new PDF
        try:
            page = doc.new_page()

            # Add some text to the page
            text_content = "This is a test PDF document.\nIt contains multiple lines of text.\nPage 1 content here."
            page.insert_text((50, 100), text_content, fontsize=12)

            # Save the PDF
            doc.save(pdf_path)
        finally:
            # Close the document even when building or saving it fails, so a
            # failing test does not leave the handle open.
            doc.close()

        # Test the extract_pdf_text function
        result = extract_pdf_text(pdf_path)

        # Verify the extracted text contains our content
        assert isinstance(result, str)
        assert "This is a test PDF document" in result
        assert "Page 1:" in result  # Should include page number
        assert "multiple lines of text" in result

    finally:
        # Clean up the temporary PDF file
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)
342
+
343
+
344
def test_extract_pdf_text_empty_pdf():
    """Test PDF text extraction with an empty PDF (no text content).

    Saves a PDF consisting of one blank page and checks that
    ``extract_pdf_text`` returns exactly the fixed "no text content" message.
    """
    import fitz  # PyMuPDF

    # Only the path is needed here: delete=False keeps the file on disk after
    # the handle closes, and the PDF is written to that path by doc.save().
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        pdf_path = temp_pdf.name

    try:
        doc = fitz.open()  # Create new PDF
        try:
            # The page object itself is never used, so it is not bound.
            doc.new_page()
            doc.save(pdf_path)
        finally:
            # Close the document even when saving fails.
            doc.close()

        # Test the extract_pdf_text function
        result = extract_pdf_text(pdf_path)

        # Should return message about no content
        assert result == "No text content found in the PDF."

    finally:
        # Clean up
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)
369
+
370
+
371
def test_extract_pdf_text_multipage():
    """Test PDF text extraction with multiple pages.

    Builds a four-page PDF in which page 3 is left blank, then checks that
    the extracted string labels and includes pages 1, 2 and 4, carries no
    ``"Page 3:"`` label, and contains a blank-line separator.
    """
    import fitz  # PyMuPDF

    # One entry per page, in order; None marks a page left without text.
    page_texts = [
        "Content from page one.",
        "Content from page two.",
        None,
        "Content from page four.",
    ]

    # Only the path is needed here: delete=False keeps the file on disk after
    # the handle closes, and the PDF is written to that path by doc.save().
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        pdf_path = temp_pdf.name

    try:
        doc = fitz.open()  # Create new PDF
        try:
            for text in page_texts:
                page = doc.new_page()
                if text is not None:
                    page.insert_text((50, 100), text, fontsize=12)

            doc.save(pdf_path)
        finally:
            # Close the document even when building or saving it fails.
            doc.close()

        # Test the extract_pdf_text function
        result = extract_pdf_text(pdf_path)

        # Verify all pages with content are included
        assert "Page 1:" in result
        assert "Content from page one" in result
        assert "Page 2:" in result
        assert "Content from page two" in result
        assert "Page 4:" in result
        assert "Content from page four" in result

        # Page 3 should be excluded (empty)
        assert "Page 3:" not in result

        # Check that pages are separated properly
        assert "\n\n" in result  # Pages should be separated by double newlines

    finally:
        # Clean up
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)
422
+
423
+
424
def test_process_user_input_with_pdf():
    """Test processing user input with a PDF file.

    Sends a message dict carrying prompt text plus one PDF path through
    ``process_user_input`` and checks that the result holds two text items:
    the prompt itself, then the extracted PDF content.
    """
    import fitz  # PyMuPDF

    # Only the path is needed here: delete=False keeps the file on disk after
    # the handle closes, and the PDF is written to that path by doc.save().
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        pdf_path = temp_pdf.name

    try:
        doc = fitz.open()  # Create new PDF
        try:
            page = doc.new_page()
            page.insert_text((50, 100), "Test PDF content for user input processing.", fontsize=12)
            doc.save(pdf_path)
        finally:
            # Close the document even when building or saving it fails.
            doc.close()

        # Test processing user input with PDF
        message = {
            "text": "Analyze this PDF",
            "files": [pdf_path],
        }

        result = process_user_input(message, 3)

        # Should have 2 items (original text + PDF content)
        assert len(result) == 2

        # First item should be the message text
        assert result[0]["type"] == "text"
        assert result[0]["text"] == "Analyze this PDF"

        # Second item should be PDF content
        assert result[1]["type"] == "text"
        assert "PDF Content:" in result[1]["text"]
        assert "Test PDF content for user input processing" in result[1]["text"]
        assert "Page 1:" in result[1]["text"]

    finally:
        # Clean up
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)
465
+
466
+
467
def test_process_user_input_pdf_error_handling():
    """Check that an invalid ``.pdf`` upload produces an error text item.

    The file has a ``.pdf`` suffix but holds plain bytes rather than a real
    PDF; the processed result is expected to contain the prompt text followed
    by a text item that includes "Error processing PDF:".
    """
    # delete=False keeps the bogus file on disk once the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as bogus_pdf:
        invalid_pdf_path = bogus_pdf.name
        bogus_pdf.write(b"This is not a valid PDF file content")

    try:
        user_message = {"text": "Process invalid PDF", "files": [invalid_pdf_path]}
        processed = process_user_input(user_message, 3)

        # Exactly two items: the original text and the error message.
        assert len(processed) == 2

        prompt_item = processed[0]
        assert prompt_item["type"] == "text"
        assert prompt_item["text"] == "Process invalid PDF"

        error_item = processed[1]
        assert error_item["type"] == "text"
        assert "Error processing PDF:" in error_item["text"]
    finally:
        # Remove the temporary file if it is still present.
        if os.path.exists(invalid_pdf_path):
            os.unlink(invalid_pdf_path)
497
+
498
+
499
def test_process_history_with_pdf():
    """Test that PDF files in history are handled correctly.

    Feeds ``process_history`` a history whose first user entry is a
    one-element tuple holding a PDF path, followed by a user question, an
    assistant reply and a final user message. The expected result has three
    messages, the first of which combines a "[PDF uploaded previously]"
    placeholder with the user's question.
    """
    import fitz  # PyMuPDF

    # Only the path is needed here: delete=False keeps the file on disk after
    # the handle closes, and the PDF is written to that path by doc.save().
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        pdf_path = temp_pdf.name

    try:
        doc = fitz.open()  # Create new PDF
        try:
            page = doc.new_page()
            page.insert_text((50, 100), "Historical PDF content.", fontsize=12)
            doc.save(pdf_path)
        finally:
            # Close the document even when building or saving it fails.
            doc.close()

        # Test history with PDF file
        history = [
            {"role": "user", "content": (pdf_path,)},
            {"role": "user", "content": "What does this PDF contain?"},
            {"role": "assistant", "content": "The PDF contains some text."},
            {"role": "user", "content": "Thanks!"},
        ]

        result = process_history(history)

        # Should have 3 messages (user turn, assistant turn, final user turn)
        assert len(result) == 3

        # First user turn should have PDF placeholder and text
        assert result[0]["role"] == "user"
        assert len(result[0]["content"]) == 2
        assert result[0]["content"][0] == {"type": "text", "text": "[PDF uploaded previously]"}
        assert result[0]["content"][1] == {"type": "text", "text": "What does this PDF contain?"}

        # Assistant response
        assert result[1]["role"] == "assistant"
        assert result[1]["content"] == [{"type": "text", "text": "The PDF contains some text."}]

        # Final user message
        assert result[2]["role"] == "user"
        assert result[2]["content"] == [{"type": "text", "text": "Thanks!"}]

    finally:
        # Clean up
        if os.path.exists(pdf_path):
            os.unlink(pdf_path)