From 009bf53345d5debfd9797d150e73044e36e77b96 Mon Sep 17 00:00:00 2001
From: BcKmini <akkn920@naver.com>
Date: Sun, 14 Sep 2025 01:00:54 +0900
Subject: [PATCH 1/2] 0914 01:00

---
 requirements.txt        |  22 ++-
 routers/file.py         | 225 ++++++++++++++++------------
 schemas/file.py         |  23 +++
 utils/ocr/__init__.py   |  15 ++
 utils/ocr/converters.py |  88 +++++++++++
 utils/ocr/ocr_core.py   | 324 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 594 insertions(+), 103 deletions(-)
 create mode 100644 schemas/file.py
 create mode 100644 utils/ocr/__init__.py
 create mode 100644 utils/ocr/converters.py
 create mode 100644 utils/ocr/ocr_core.py

diff --git a/requirements.txt b/requirements.txt
index f7e3b3f..495ed74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,16 +3,16 @@ uvicorn
 pydantic
 sqlalchemy
 mysql-connector-python
-dotenv
+python-dotenv
 google-auth
 requests
 python-jose[cryptography]
 bcrypt
 
-torch==2.3.0+cu121
-torchaudio==2.3.0+cu121
-torchvision==0.18.0+cu121
---extra-index-url https://download.pytorch.org/whl/cu121
+# PyTorch (macOS: the CPU/MPS build is installed automatically)
+torch==2.3.0
+torchvision==0.18.0
+torchaudio==2.3.0
 
 transformers>=4.40.0
 accelerate
@@ -20,4 +20,14 @@ sentencepiece
 protobuf
 python-multipart
 easyocr
-whisper
\ No newline at end of file
+whisper
+pytesseract
+pdf2image
+PyMuPDF
+python-docx
+
+langchain>=0.2.0
+langchain-community
+langchain-core
+langchain-openai
+langchain-ollama
diff --git a/routers/file.py b/routers/file.py
index 205132e..539f8db 100644
--- a/routers/file.py
+++ b/routers/file.py
@@ -1,58 +1,29 @@
 # ~/noteflow/Backend/routers/file.py
 
 import os
-import io
-import whisper
-model = whisper.load_model("base")
 from datetime import datetime
-import numpy as np
 from typing import Optional, List
-from urllib.parse import quote
 
-from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, status
+from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, status, Query, Response
 from fastapi.responses import FileResponse
 from sqlalchemy.orm import Session
-from PIL import Image
 
 from db import get_db
 from models.file import File as FileModel
 from models.note import Note as NoteModel
 from utils.jwt_utils import get_current_user
 
-# -------------------------------
-# 1) EasyOCR 라이브러리 임포트 (GPU 모드 활성화)
-# -------------------------------
-import easyocr
-reader = easyocr.Reader(["ko", "en"], gpu=True)
-
-# -------------------------------
-# 2) Hugging Face TrOCR 모델용 파이프라인 (GPU 사용)
-# -------------------------------
-from transformers import pipeline
-
-hf_trocr_printed = pipeline(
-    "image-to-text",
-    model="microsoft/trocr-base-printed",
-    device=0,
-    trust_remote_code=True
-)
-hf_trocr_handwritten = pipeline(
-    "image-to-text",
-    model="microsoft/trocr-base-handwritten",
-    device=0,
-    trust_remote_code=True
-)
-hf_trocr_small_printed = pipeline(
-    "image-to-text",
-    model="microsoft/trocr-small-printed",
-    device=0,
-    trust_remote_code=True
-)
-hf_trocr_large_printed = pipeline(
-    "image-to-text",
-    model="microsoft/trocr-large-printed",
-    device=0,
-    trust_remote_code=True
+# 추가/변경: 공통 OCR 파이프라인(thin wrapper)
+from utils.ocr import run_pipeline, detect_type
+from schemas.file import OCRResponse
+
+# Added: allowed-extension sets (a mismatch is answered with 200 + warnings)
+ALLOWED_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}
+ALLOWED_PDF_EXTS   = {".pdf"}
+ALLOWED_DOC_EXTS   = {".doc", ".docx"}
+ALLOWED_HWP_EXTS   = {".hwp"}
+ALLOWED_ALL_EXTS   = (
+    ALLOWED_IMAGE_EXTS | ALLOWED_PDF_EXTS | ALLOWED_DOC_EXTS | ALLOWED_HWP_EXTS
 )
 
 # 업로드 디렉토리 설정
@@ -65,6 +36,38 @@
 
 router = APIRouter(prefix="/api/v1/files", tags=["Files"])
 
+@router.get("/ocr/diag", summary="OCR 런타임 의존성 진단")
+def ocr_dependency_diag():
+    # Reports which external OCR tools are on PATH, plus the tesseract version and languages.
+    import shutil, subprocess
+    def which(cmd: str) -> bool:
+        return shutil.which(cmd) is not None
+
+    def run(cmd: list[str]) -> str:
+        # Best-effort probe: a failure comes back as an "ERR: ..." string, it is never raised.
+        try:
+            out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=5)
+            return out.decode(errors="ignore").strip()
+        except Exception as e:
+            return f"ERR: {e}"
+
+    tesseract_ok = which("tesseract")
+    langs = None
+    tess_ver = None
+    if tesseract_ok:
+        # next(..., None) rather than [0]: empty output must not raise IndexError (HTTP 500).
+        tess_ver = next(iter(run(["tesseract", "--version"]).splitlines()), None)
+        langs_out = run(["tesseract", "--list-langs"])
+        if langs_out and not langs_out.startswith("ERR:"):
+            langs = [l.strip() for l in langs_out.splitlines() if l and not l.lower().startswith("list of available")]
+    return {
+        "tesseract": tesseract_ok,
+        "tesseract_version": tess_ver,
+        "tesseract_langs": langs,
+        "poppler": which("pdftoppm") or which("pdftocairo"),
+        "libreoffice": which("soffice") or which("libreoffice"),
+        "hwp5txt": which("hwp5txt"),
+    }
 
 @router.post(
     "/upload",
@@ -194,73 +197,94 @@ def download_file(
 
 @router.post(
     "/ocr",
-    summary="이미지 OCR → 텍스트 변환 후 노트 생성",
-    response_model=dict
+    summary="이미지/PDF/DOC/DOCX/HWP OCR → 텍스트 변환 후 노트 생성",
+    response_model=OCRResponse
 )
 async def ocr_and_create_note(
-    ocr_file: UploadFile = File(...),
+    # Changed: the upload is accepted under 'file' (default) or the legacy 'ocr_file' field
+    file: Optional[UploadFile] = File(None, description="기본 업로드 필드명"),
+    ocr_file: Optional[UploadFile] = File(None, description="과거 호환 업로드 필드명"),
     folder_id: Optional[int] = Form(None),
+    langs: str = Query("kor+eng", description="Tesseract 언어코드(예: kor+eng)"),
+    max_pages: int = Query(50, ge=1, le=500, description="최대 처리 페이지 수(기본 50)"),
     db: Session = Depends(get_db),
     current_user = Depends(get_current_user)
 ):
     """
-    • EasyOCR + TrOCR 모델로 이미지에서 텍스트 추출
-    • 가장 긴 결과를 선택해 새 노트로 저장
+    변경 전: 이미지 전용 EasyOCR/TrOCR로 텍스트 추출 후 노트 생성.
+    변경 후(추가/변경): 공통 파이프라인(utils.ocr.run_pipeline)으로 이미지/PDF/DOC/DOCX/HWP 처리.
+    - 예외는 200으로 내려가며, results=[] + warnings에 사유 기입.
+    - 결과 텍스트를 합쳐 비어있지 않으면 기존과 동일하게 노트를 생성.
     """
-    # 1) 이미지 로드
-    contents = await ocr_file.read()
-    try:
-        image = Image.open(io.BytesIO(contents)).convert("RGB")
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"이미지 처리 실패: {e}")
+    # Pick whichever upload field was supplied
+    upload = file or ocr_file
+    if upload is None:
+        raise HTTPException(status_code=400, detail="업로드 파일이 필요합니다. 필드명은 'file' 또는 'ocr_file'을 사용하세요.")
+
+    filename = upload.filename or "uploaded"
+    mime = upload.content_type
+
+    # Check the extension against the allow-list (a mismatch is answered with 200 + warnings)
+    _, ext = os.path.splitext(filename)
+    ext = ext.lower()
+    if ext and ext not in ALLOWED_ALL_EXTS:
+        return OCRResponse(
+            filename=filename,
+            mime=mime,
+            page_count=0,
+            results=[],
+            warnings=[f"허용되지 않는 확장자({ext}). 허용: {sorted(ALLOWED_ALL_EXTS)}"],
+            note_id=None,
+            text=None,
+        )
 
-    # 2) EasyOCR
-    try:
-        image_np = np.array(image)
-        easy_results = reader.readtext(image_np)
-        easy_text = " ".join([res[1] for res in easy_results])
-    except Exception:
-        easy_text = ""
-
-    # 3) TrOCR 4개 모델
-    hf_texts: List[str] = []
-    try:
-        for pipe in (
-            hf_trocr_printed,
-            hf_trocr_handwritten,
-            hf_trocr_small_printed,
-            hf_trocr_large_printed
-        ):
-            out = pipe(image)
-            if isinstance(out, list) and "generated_text" in out[0]:
-                hf_texts.append(out[0]["generated_text"].strip())
-    except Exception:
-        pass
-
-    # 4) 가장 긴 결과 선택
-    candidates = [t for t in [easy_text] + hf_texts if t and t.strip()]
-    if not candidates:
-        raise HTTPException(status_code=500, detail="텍스트를 인식할 수 없습니다.")
-    ocr_text = max(candidates, key=len)
-
-    # 5) Note 생성
-    try:
-        new_note = NoteModel(
-            user_id=current_user.u_id,
-            folder_id=folder_id,
-            title="OCR 결과",
-            content=ocr_text
+    # Determine the file type (secondary guard so "unknown" never reaches the pipeline)
+    ftype = detect_type(filename, mime)
+    if ftype == "unknown":
+        return OCRResponse(
+            filename=filename,
+            mime=mime,
+            page_count=0,
+            results=[],
+            warnings=["지원되지 않는 파일 형식입니다."],
+            note_id=None,
+            text=None,
         )
-        db.add(new_note)
-        db.commit()
-        db.refresh(new_note)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"노트 저장 실패: {e}")
 
-    return {
-        "note_id": new_note.id,
-        "text": ocr_text
-    }
+    data = await upload.read()
+
+    # OCR is synchronous and CPU-heavy; calling it directly inside this coroutine would
+    # stall the event loop (and every other request) until it finishes.
+    from fastapi.concurrency import run_in_threadpool
+    pipe = await run_in_threadpool(
+        run_pipeline, filename=filename, mime=mime, data=data, langs=langs, max_pages=max_pages
+    )
+
+    # Join the non-empty page texts into the note body.
+    merged_text = "\n\n".join([
+        item.get("text", "") for item in (pipe.get("results") or []) if item.get("text")
+    ]).strip()
+
+    note_id: Optional[int] = None
+    if merged_text:
+        try:
+            new_note = NoteModel(
+                user_id=current_user.u_id,
+                folder_id=folder_id,
+                title="OCR 결과",
+                content=merged_text,
+            )
+            db.add(new_note)
+            db.commit()
+            db.refresh(new_note)
+            note_id = new_note.id
+        except Exception as e:
+            db.rollback()  # leave the session usable after the failed flush/commit
+            (pipe.setdefault("warnings", [])).append(f"노트 저장 실패: {e}")
+
+    pipe["note_id"] = note_id
+    pipe["text"] = merged_text or None
+    return pipe
 
 
 @router.post("/audio")
@@ -338,3 +362,10 @@ async def upload_audio_and_transcribe(
         "message": "STT 및 노트 저장 완료",
         "transcript": transcript
     }
+@router.options("/ocr")
+def ocr_cors_preflight() -> Response:  # NOTE: docstring kept verbatim; FastAPI publishes it as the OpenAPI description
+    """CORS preflight용 OPTIONS 응답. 일부 프록시/클라이언트에서 405 회피.
+    변경 전: 별도 OPTIONS 라우트 없음(미들웨어에 의존)
+    변경 후(추가): 명시적으로 200을 반환
+    """
+    return Response(status_code=200)  # explicit OPTIONS route: always an empty 200, no body
diff --git a/schemas/file.py b/schemas/file.py
new file mode 100644
index 0000000..e329d89
--- /dev/null
+++ b/schemas/file.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class OCRResultItem(BaseModel):
+    page: int  # page number of this entry (the pipeline numbers pages from 1)
+    text: str  # text extracted for that page; may be an empty string
+
+
+class OCRResponse(BaseModel):
+    # New fields (added/changed): metadata produced by the shared OCR pipeline
+    filename: str
+    mime: Optional[str] = None
+    page_count: int
+    results: List[OCRResultItem] = Field(default_factory=list)
+    warnings: List[str] = Field(default_factory=list)  # human-readable failure/limitation notes
+
+    # Backward compatibility: fields of the previous image-only OCR response
+    note_id: Optional[int] = None
+    text: Optional[str] = None
+
diff --git a/utils/ocr/__init__.py b/utils/ocr/__init__.py
new file mode 100644
index 0000000..5db369a
--- /dev/null
+++ b/utils/ocr/__init__.py
@@ -0,0 +1,15 @@
+"""
+utils.ocr 패키지
+
+Summary of additions/changes
+- Exposes the shared OCR pipeline entry point (run_pipeline) to callers
+- Handles image/PDF/DOC/DOCX/HWP files through a single interface
+"""
+
+from .ocr_core import run_pipeline, detect_type
+
+__all__ = [
+    "run_pipeline",
+    "detect_type",
+]
+
diff --git a/utils/ocr/converters.py b/utils/ocr/converters.py
new file mode 100644
index 0000000..7455d16
--- /dev/null
+++ b/utils/ocr/converters.py
@@ -0,0 +1,88 @@
+"""
+utils/ocr/converters.py
+
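+Additions/changes
+- Conversion/extraction helpers for PDF/DOC/DOCX/HWP that the pipeline can reuse
+- External tools (soffice, hwp5txt) may be missing: the helpers raise in that case and the caller records the failure in warnings instead of propagating it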
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+import tempfile
+from typing import List
+
+from PIL import Image
+
+
+def save_bytes_to_temp(data: bytes, suffix: str = "") -> str:
+    """Write ``data`` to a new temporary file and return its path.
+
+    The file is not removed automatically; the caller must delete it.
+    """
+    with tempfile.NamedTemporaryFile("wb", suffix=suffix, delete=False) as tmp:
+        tmp.write(data)
+    return tmp.name
+
+
+def pdf_to_images(pdf_path: str, dpi: int = 200) -> List[Image.Image]:
+    """Render the PDF at ``pdf_path`` to a list of PIL images, one per page.
+
+    Delegates to pdf2image.convert_from_path; poppler may be required on the system.
+    """
+    import pdf2image  # deferred import: only needed when a PDF is actually rendered
+    return pdf2image.convert_from_path(pdf_path, dpi=dpi)
+
+
+def office_to_pdf(input_path: str, outdir: str, timeout: float = 120.0) -> str:
+    """Convert a DOC/DOCX file to PDF with headless LibreOffice.
+
+    Returns the path of the PDF written into ``outdir``. Raises RuntimeError when
+    LibreOffice is missing, exits non-zero, runs longer than ``timeout`` seconds, or
+    produces no PDF; callers are expected to record that as a warning.
+    """
+    soffice = shutil.which("soffice") or shutil.which("libreoffice")
+    if not soffice:
+        raise RuntimeError("LibreOffice(soffice) 실행 파일을 찾을 수 없습니다.")
+
+    def list_pdfs() -> set:
+        return {p for p in os.listdir(outdir) if p.lower().endswith(".pdf")}
+
+    # Snapshot first: outdir may be a shared directory (e.g. the system temp dir), and
+    # the fallback below must never return a PDF that this call did not create.
+    before = list_pdfs()
+    cmd = [soffice, "--headless", "--convert-to", "pdf", "--outdir", outdir, input_path]
+    try:
+        # Bounded wait: a hung soffice must not block the request forever.
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
+    except subprocess.TimeoutExpired as e:
+        raise RuntimeError(f"LibreOffice 변환 실패: {timeout}초 내에 완료되지 않았습니다.") from e
+    if proc.returncode != 0:
+        raise RuntimeError(f"LibreOffice 변환 실패: {proc.stderr.decode(errors='ignore')[:300]}")
+    base = os.path.splitext(os.path.basename(input_path))[0]
+    pdf_path = os.path.join(outdir, f"{base}.pdf")
+    if not os.path.exists(pdf_path):
+        # Some environments name the output differently: accept only a newly created PDF.
+        created = sorted(list_pdfs() - before)
+        if not created:
+            raise RuntimeError("PDF 결과 파일이 생성되지 않았습니다.")
+        pdf_path = os.path.join(outdir, created[0])
+    return pdf_path
+
+
+def hwp_to_text(input_path: str, timeout: float = 60.0) -> str:
+    """Extract plain text from an HWP file with the ``hwp5txt`` CLI.
+    Raises RuntimeError if the tool is missing, exits non-zero, or exceeds ``timeout`` seconds."""
+    hwp5txt = shutil.which("hwp5txt")
+    if not hwp5txt:
+        raise RuntimeError("hwp5txt 실행 파일을 찾을 수 없습니다.")
+    try:  # bounded wait: a stuck converter must not block the request forever
+        proc = subprocess.run([hwp5txt, input_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
+    except subprocess.TimeoutExpired as e:
+        raise RuntimeError(f"hwp5txt 추출 실패: {timeout}초 내에 완료되지 않았습니다.") from e
+    if proc.returncode != 0:
+        raise RuntimeError(f"hwp5txt 추출 실패: {proc.stderr.decode(errors='ignore')[:300]}")
+    return proc.stdout.decode(errors="ignore")
+
diff --git a/utils/ocr/ocr_core.py b/utils/ocr/ocr_core.py
new file mode 100644
index 0000000..c15a315
--- /dev/null
+++ b/utils/ocr/ocr_core.py
@@ -0,0 +1,324 @@
+"""
+utils/ocr/ocr_core.py
+
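+Additions/changes
+- File type detection (extension first, MIME as a fallback) and the unified OCR pipeline (run_pipeline)
+- Images: the legacy EasyOCR/TrOCR models (when available) and pytesseract are both run; the longer text is kept
+- PDF: PyMuPDF text layer first (pages without text are rendered and OCR'd); pdf2image (convert_from_path, dpi=200) is the fallback
+- DOC/DOCX: .docx is read with python-docx; other Word files are converted to PDF with LibreOffice (soffice --headless) and then read as a PDF
+- HWP: text is extracted with hwp5txt (on success it is added to results as page=1); a failure is recorded in warnings
+- Large inputs: at most MAX_PAGES pages (default 50) are processed, and a warning is recorded when pages are cut off
+- Exceptions are not raised: results=[] plus the reason in warnings lets the caller still answer with 200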
+"""
+
+from __future__ import annotations
+
+import io
+import os
+from typing import Dict, List, Optional, Tuple
+
+from PIL import Image
+
+from .converters import (
+    save_bytes_to_temp,
+    pdf_to_images,
+    office_to_pdf,
+    hwp_to_text,
+)
+
+
+# Supported file-extension sets (lower-case, including the leading dot)
+IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}
+PDF_EXTS = {".pdf"}
+DOC_EXTS = {".doc", ".docx"}
+HWP_EXTS = {".hwp"}
+
+
+def detect_type(filename: str, content_type: Optional[str]) -> str:
+    """Classify an upload as "image", "pdf", "docx", "hwp" or "unknown".
+
+    The file extension decides first; the MIME type is consulted only when the
+    extension is not recognised. Both .doc and .docx are reported as "docx".
+    """
+    suffix = os.path.splitext(filename or "")[1].lower()
+    by_extension = (
+        (IMAGE_EXTS, "image"),
+        (PDF_EXTS, "pdf"),
+        (DOC_EXTS, "docx"),
+        (HWP_EXTS, "hwp"),
+    )
+    for known_exts, kind in by_extension:
+        if suffix in known_exts:
+            return kind
+    mime = (content_type or "").lower()
+    if mime.startswith("image/"):
+        return "image"
+    if mime == "application/pdf":
+        return "pdf"
+    if mime in ("application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
+        return "docx"
+    if "hwp" in mime:
+        return "hwp"
+    return "unknown"
+
+
+def _ocr_image_pytesseract(img: Image.Image, langs: str, warnings: List[str]) -> str:
+    """Run Tesseract (via pytesseract) on ``img`` and return the stripped text.
+    Any exception, from the import or from the OCR call itself, is appended to
+    ``warnings`` and an empty string is returned instead.
+    """
+    try:
+        from pytesseract import image_to_string
+        return image_to_string(img, lang=langs).strip()
+    except Exception as exc:
+        warnings.append(f"pytesseract OCR 실패: {exc}")
+    return ""
+
+
+_LEGACY_MODEL_CACHE: Dict[str, object] = {}  # loaded OCR models, reused for the process lifetime
+
+
+def _ocr_image_legacy(img: Image.Image, warnings: List[str]) -> str:
+    """Run the legacy model OCR (EasyOCR plus four TrOCR checkpoints) on ``img``.
+
+    Returns the longest non-blank candidate text, or "" when nothing was read.
+    Failures are appended to ``warnings`` and never raised. Models are built once
+    and cached instead of being rebuilt for every image (i.e. every PDF page).
+    """
+    try:
+        import numpy as np
+        import easyocr
+        from transformers import pipeline
+    except Exception as e:
+        warnings.append(f"기존 OCR 모듈(EasyOCR/TrOCR) 사용 불가: {e}")
+        return ""
+
+    def cached(key: str, factory):
+        # Only successful loads are stored, so a failed load is retried on the next call.
+        if key not in _LEGACY_MODEL_CACHE:
+            _LEGACY_MODEL_CACHE[key] = factory()
+        return _LEGACY_MODEL_CACHE[key]
+
+    candidates: List[str] = []
+    try:
+        reader = cached("easyocr", lambda: easyocr.Reader(["ko", "en"], gpu=False))
+        easy_results = reader.readtext(np.array(img.convert("RGB")))
+        candidates.append(" ".join(res[1] for res in easy_results))
+    except Exception as e:
+        warnings.append(f"EasyOCR 실패: {e}")
+
+    # NOTE(review): trust_remote_code=True lets a model repo run its own code; confirm it is needed.
+    for model_name in ("microsoft/trocr-base-printed", "microsoft/trocr-base-handwritten",
+                       "microsoft/trocr-small-printed", "microsoft/trocr-large-printed"):
+        try:
+            pipe = cached(model_name, lambda: pipeline("image-to-text", model=model_name, trust_remote_code=True))
+            out = pipe(img)
+            if isinstance(out, list) and out and isinstance(out[0], dict) and "generated_text" in out[0]:
+                candidates.append(out[0]["generated_text"].strip())
+        except Exception as e:
+            warnings.append(f"TrOCR({model_name}) 실패: {e}")
+
+    candidates = [t for t in candidates if t and t.strip()]
+    return max(candidates, key=len) if candidates else ""
+
+
+def _ocr_image_best(img: Image.Image, langs: str, warnings: List[str]) -> str:
+    """Return the longer of the legacy-model and Tesseract readings of ``img``.
+
+    The model path (EasyOCR/TrOCR) runs first, then pytesseract; both always
+    run. On equal length the model text wins; "" means neither produced text.
+    """
+    readings = (
+        _ocr_image_legacy(img, warnings),
+        _ocr_image_pytesseract(img, langs, warnings),
+    )
+    non_empty = [text for text in readings if text]
+    return max(non_empty, key=len, default="")
+
+
+def _pdf_page_to_image(page, zoom: float = 2.0) -> Image.Image:
+    """Rasterise one PyMuPDF page to an RGB PIL image (zoom 2 is roughly 144 DPI)."""
+    import fitz  # PyMuPDF
+
+    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
+    mode = "RGBA" if pix.alpha else "RGB"
+    return Image.frombytes(mode, [pix.width, pix.height], pix.samples).convert("RGB")
+
+
+def _ocr_pdf_bytes(
+    data: bytes, langs: str, max_pages: int, warnings: List[str]
+) -> Tuple[List[Dict], int]:
+    """Extract per-page text from an in-memory PDF; returns ``(results, page_count)``.
+
+    PyMuPDF is tried first: a page with a native text layer is used as-is, a page
+    without one is rendered and OCR'd. If PyMuPDF fails, all pages are rendered
+    through pdf2image and OCR'd instead. At most ``max_pages`` pages are processed,
+    and problems are appended to ``warnings`` rather than raised.
+    """
+    truncated = f"페이지가 {max_pages}장을 초과하여 앞 {max_pages}페이지만 처리합니다."
+    results: List[Dict] = []
+    # (1-based page number, image) pairs waiting for OCR. The number travels with the
+    # image because scanned pages may be interleaved with text pages.
+    pending: List[Tuple[int, Image.Image]] = []
+    page_count = 0
+    try:
+        import fitz  # PyMuPDF
+
+        with fitz.open(stream=data, filetype="pdf") as doc:  # closed on exit
+            total = doc.page_count
+            if total > max_pages:
+                warnings.append(truncated)
+            page_count = min(total, max_pages)
+            for i in range(page_count):
+                page = doc.load_page(i)
+                txt = (page.get_text("text") or "").strip()
+                if txt:
+                    results.append({"page": i + 1, "text": txt})
+                    continue
+                try:
+                    pending.append((i + 1, _pdf_page_to_image(page)))
+                except Exception as e:
+                    warnings.append(f"PDF 페이지 렌더링 실패(page {i+1}): {e}")
+    except Exception as e:
+        warnings.append(f"PyMuPDF 처리 실패: {e}")
+        # Start over so pages read before the failure are not reported twice.
+        results, pending = [], []
+        pdf_path = save_bytes_to_temp(data, suffix=".pdf")
+        try:
+            images = pdf_to_images(pdf_path, dpi=200)
+        except Exception as ee:
+            warnings.append(f"PDF를 이미지로 변환 실패: {ee}")
+            images = []
+        finally:
+            try:
+                os.remove(pdf_path)
+            except OSError:
+                pass  # best-effort cleanup of the temp file
+        if len(images) > max_pages:
+            warnings.append(truncated)
+            images = images[:max_pages]
+        pending = list(enumerate(images, start=1))
+        page_count = len(images)
+
+    for page_no, img in pending:
+        results.append({"page": page_no, "text": _ocr_image_best(img, langs, warnings) or ""})
+    results.sort(key=lambda item: item["page"])  # text-layer and OCR'd pages back in page order
+    return results, page_count
+
+
+def run_pipeline(
+    filename: str,
+    mime: Optional[str],
+    data: bytes,
+    langs: str = "kor+eng",
+    max_pages: int = 50,
+) -> Dict:
+    """Run the shared OCR pipeline on one uploaded file.
+
+    Args:
+        filename: Original upload name; its extension selects the handler.
+        mime: Content type, consulted only when the extension is not recognised.
+        data: Raw file bytes.
+        langs: Tesseract language codes, e.g. "kor+eng".
+        max_pages: Upper bound on the number of PDF pages processed.
+
+    Returns a dict with this shape:
+    {
+      "filename": str,
+      "mime": str | null,
+      "page_count": int,
+      "results": [{"page": int, "text": str}],
+      "warnings": [str]
+    }
+
+    Handles images (OCR), PDF (text layer first, OCR for pages without one),
+    .docx (python-docx), other Word files (LibreOffice to PDF) and HWP (hwp5txt).
+    Never raises: failures are appended to ``warnings`` so the caller can still
+    answer with HTTP 200.
+    """
+    warnings: List[str] = []
+    results: List[Dict] = []
+    page_count = 0
+
+    ftype = detect_type(filename, mime)
+
+    try:
+        if ftype == "image":
+            # A single image is treated as page 1.
+            try:
+                img = Image.open(io.BytesIO(data)).convert("RGB")
+            except Exception as e:
+                warnings.append(f"이미지 열기 실패: {e}")
+                img = None
+
+            if img is not None:
+                text = _ocr_image_best(img, langs, warnings)
+                page_count = 1
+                results.append({"page": 1, "text": text or ""})
+
+        elif ftype == "pdf":
+            results, page_count = _ocr_pdf_bytes(data, langs, max_pages, warnings)
+
+        elif ftype == "docx":
+            # .docx: native text via python-docx; anything else goes through LibreOffice.
+            ext = os.path.splitext(filename or "")[1].lower()
+            if ext == ".docx":
+                try:
+                    from docx import Document  # python-docx
+                    doc = Document(io.BytesIO(data))
+                    text = "\n".join(p.text for p in doc.paragraphs if p.text).strip()
+                    if text:
+                        results.append({"page": 1, "text": text})
+                        page_count = 1
+                    else:
+                        warnings.append("DOCX에서 추출된 텍스트가 없습니다.")
+                except Exception as e:
+                    warnings.append(f"python-docx 처리 실패: {e}")
+            else:
+                import tempfile
+
+                # Convert inside a private directory: the converter may fall back to
+                # "any PDF found in outdir", which must never be another request's file
+                # in the shared system temp dir. The directory is removed on exit.
+                with tempfile.TemporaryDirectory() as workdir:
+                    in_path = os.path.join(workdir, f"input{ext or '.doc'}")
+                    try:
+                        with open(in_path, "wb") as fh:
+                            fh.write(data)
+                        with open(office_to_pdf(in_path, workdir), "rb") as fh:
+                            pdf_bytes = fh.read()
+                    except Exception as e:
+                        warnings.append(f"DOC 변환 실패: {e}")
+                    else:
+                        # Same handling as a directly uploaded PDF.
+                        results, page_count = _ocr_pdf_bytes(pdf_bytes, langs, max_pages, warnings)
+
+        elif ftype == "hwp":
+            # HWP: text extraction through hwp5txt; a success is reported as page 1.
+            in_path = save_bytes_to_temp(data, suffix=".hwp")
+            try:
+                text = hwp_to_text(in_path)
+                results.append({"page": 1, "text": (text or "").strip()})
+                page_count = 1
+            except Exception as e:
+                warnings.append(f"HWP 텍스트 추출 실패: {e}")
+            finally:
+                try:
+                    os.remove(in_path)
+                except OSError:
+                    pass  # best-effort cleanup of the temp file
+
+        else:
+            warnings.append("지원되지 않는 파일 형식입니다.")
+
+    except Exception as e:
+        # Absorb anything unexpected so the caller can still answer with HTTP 200.
+        warnings.append(f"파이프라인 실행 오류: {e}")
+
+    return {
+        "filename": filename,
+        "mime": mime,
+        "page_count": page_count,
+        "results": results,
+        "warnings": warnings,
+    }

From 8cc2056b665a513469714f698c623526bc85b42c Mon Sep 17 00:00:00 2001
From: BcKmini <akkn920@naver.com>
Date: Sun, 14 Sep 2025 03:49:19 +0900
Subject: [PATCH 2/2] 0914 03:49

---
 .github/workflows/ci.yml     | 41 +++++++++++++++++++++++++++
 .github/workflows/docker.yml | 35 +++++++++++++++++++++++
 Dockerfile                   | 20 +++++++++++++
 README.md                    | 55 ++++++++++++++++++++++++++++++++++++
 routers/file.py              |  4 ++-
 5 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 .github/workflows/docker.yml
 create mode 100644 Dockerfile
 create mode 100644 README.md

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..bbc00c9
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,41 @@
+name: CI
+
+on:
+  pull_request:
+    branches: [ main ]
+  push:
+    branches: [ main ]
+    paths:
+      - '**/*.py'
+      - 'requirements.txt'
+      - '.github/workflows/ci.yml'
+
+jobs:
+  lint-test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+
+      - name: Install deps
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Syntax check
+        run: |
+          python -m py_compile $(git ls-files '*.py' | tr '\n' ' ')
+
+      - name: Import smoke
+        run: |
+          python - << 'PY'
+          from importlib import import_module
+          import_module('main')
+          print('Import OK')
+          PY
+
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 0000000..1c8c292
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,35 @@
+name: Docker Build & Push (Backend)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - '**'
+      - '!README.md'
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: |
+            ghcr.io/${{ github.repository }}:backend-latest
+            ghcr.io/${{ github.repository }}:backend-${{ github.sha }}
+
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..99b6d62
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.11-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# System dependencies for pdf2image
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# COPY paths are relative to the build context, which is this folder (`docker build .`).
+COPY requirements.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . ./
+
+EXPOSE 8080
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c1a4caf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,55 @@
+# Noteflow Backend (FastAPI)
+
+## Overview
+- FastAPI backend for Noteflow
+- OCR pipeline supports images, PDF, DOC/DOCX, HWP (via utilities and system tools)
+
+## Run (local)
+```
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+uvicorn main:app --host 0.0.0.0 --port 8080 --reload
+```
+
+Env (optional):
+- `SECRET_KEY`, `ACCESS_TOKEN_EXPIRE_MINUTES`
+- Database URLs if you connect a DB (current code uses provided models)
+
+## OCR system tools (optional but recommended)
+- PyMuPDF (Python) used by default for PDF text extraction
+- Optional fallbacks/tools:
+  - Poppler (`pdftoppm`) for `pdf2image`
+  - LibreOffice (`soffice`) for .doc → .pdf
+  - `hwp5txt` for .hwp text extraction
+- If missing, the API still returns 200 with `warnings` explaining limitations.
+
+## API Highlights
+- `POST /api/v1/files/ocr` — OCR and create note (accepts file + optional `folder_id`, `langs`, `max_pages`)
+- `POST /api/v1/files/upload` — Upload files to folder
+- `POST /api/v1/files/audio` — STT from audio, create/append to note
+
+## CI (GitHub Actions)
+- This folder includes `.github/workflows/ci.yml`, which syntax-checks and import-smoke-tests the code on push/PR (no linter is run).
+- Python 3.11, `pip install -r requirements.txt`, syntax check and import smoke.
+
+## Docker (optional; for later)
+- Dockerfile included. Build & run locally:
+```
+docker build -t noteflow-backend .
+docker run --rm -p 8080:8080 noteflow-backend
+```
+- GitHub Actions container build:
+  - `.github/workflows/docker.yml` pushes to GHCR:
+    - `ghcr.io/<owner>/<repo>:backend-latest`
+    - `ghcr.io/<owner>/<repo>:backend-<sha>`
+- Deployment example (SSH) once you’re ready:
+```
+echo <TOKEN> | docker login ghcr.io -u <USER> --password-stdin
+docker pull ghcr.io/<owner>/<repo>:backend-latest
+docker run -d --name backend --restart=always -p 8080:8080 ghcr.io/<owner>/<repo>:backend-latest
+```
+
+## Notes
+- If you split this folder into its own repository root, the included `.github/workflows/*.yml` files will work as-is.
+- OCR runs the model path (EasyOCR + TrOCR) first and Tesseract second (each only if available), then keeps the longer of the two texts.
diff --git a/routers/file.py b/routers/file.py
index 539f8db..1a43a25 100644
--- a/routers/file.py
+++ b/routers/file.py
@@ -268,10 +268,12 @@ async def ocr_and_create_note(
     note_id: Optional[int] = None
     if merged_text:
         try:
+            # 추가/변경: 노트 제목을 업로드한 파일 이름으로 설정 (확장자 제거)
+            base_title = os.path.splitext(filename)[0].strip() or "OCR 결과"
             new_note = NoteModel(
                 user_id=current_user.u_id,
                 folder_id=folder_id,
-                title="OCR 결과",
+                title=base_title,
                 content=merged_text,
             )
             db.add(new_note)