From 009bf53345d5debfd9797d150e73044e36e77b96 Mon Sep 17 00:00:00 2001 From: BcKmini Date: Sun, 14 Sep 2025 01:00:54 +0900 Subject: [PATCH 1/2] 0914 01:00 --- requirements.txt | 22 ++- routers/file.py | 225 ++++++++++++++++------------ schemas/file.py | 23 +++ utils/ocr/__init__.py | 15 ++ utils/ocr/converters.py | 88 +++++++++++ utils/ocr/ocr_core.py | 324 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 594 insertions(+), 103 deletions(-) create mode 100644 schemas/file.py create mode 100644 utils/ocr/__init__.py create mode 100644 utils/ocr/converters.py create mode 100644 utils/ocr/ocr_core.py diff --git a/requirements.txt b/requirements.txt index f7e3b3f..495ed74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,16 +3,16 @@ uvicorn pydantic sqlalchemy mysql-connector-python -dotenv +python-dotenv google-auth requests python-jose[cryptography] bcrypt -torch==2.3.0+cu121 -torchaudio==2.3.0+cu121 -torchvision==0.18.0+cu121 ---extra-index-url https://download.pytorch.org/whl/cu121 +# PyTorch (MacOS: CPU/MPS 빌드 자동 설치됨) +torch==2.3.0 +torchvision==0.18.0 +torchaudio==2.3.0 transformers>=4.40.0 accelerate @@ -20,4 +20,14 @@ sentencepiece protobuf python-multipart easyocr -whisper \ No newline at end of file +whisper +pytesseract +pdf2image +PyMuPDF +python-docx + +langchain>=0.2.0 +langchain-community +langchain-core +langchain-openai +langchain-ollama diff --git a/routers/file.py b/routers/file.py index 205132e..539f8db 100644 --- a/routers/file.py +++ b/routers/file.py @@ -1,58 +1,29 @@ # ~/noteflow/Backend/routers/file.py import os -import io -import whisper -model = whisper.load_model("base") from datetime import datetime -import numpy as np from typing import Optional, List -from urllib.parse import quote -from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, status +from fastapi import APIRouter, Depends, UploadFile, File, Form, HTTPException, status, Query, Response from fastapi.responses import FileResponse from 
sqlalchemy.orm import Session -from PIL import Image from db import get_db from models.file import File as FileModel from models.note import Note as NoteModel from utils.jwt_utils import get_current_user -# ------------------------------- -# 1) EasyOCR 라이브러리 임포트 (GPU 모드 활성화) -# ------------------------------- -import easyocr -reader = easyocr.Reader(["ko", "en"], gpu=True) - -# ------------------------------- -# 2) Hugging Face TrOCR 모델용 파이프라인 (GPU 사용) -# ------------------------------- -from transformers import pipeline - -hf_trocr_printed = pipeline( - "image-to-text", - model="microsoft/trocr-base-printed", - device=0, - trust_remote_code=True -) -hf_trocr_handwritten = pipeline( - "image-to-text", - model="microsoft/trocr-base-handwritten", - device=0, - trust_remote_code=True -) -hf_trocr_small_printed = pipeline( - "image-to-text", - model="microsoft/trocr-small-printed", - device=0, - trust_remote_code=True -) -hf_trocr_large_printed = pipeline( - "image-to-text", - model="microsoft/trocr-large-printed", - device=0, - trust_remote_code=True +# 추가/변경: 공통 OCR 파이프라인(thin wrapper) +from utils.ocr import run_pipeline, detect_type +from schemas.file import OCRResponse + +# 추가: 허용 확장자 상수 (불일치 시 200 + warnings 응답) +ALLOWED_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"} +ALLOWED_PDF_EXTS = {".pdf"} +ALLOWED_DOC_EXTS = {".doc", ".docx"} +ALLOWED_HWP_EXTS = {".hwp"} +ALLOWED_ALL_EXTS = ( + ALLOWED_IMAGE_EXTS | ALLOWED_PDF_EXTS | ALLOWED_DOC_EXTS | ALLOWED_HWP_EXTS ) # 업로드 디렉토리 설정 @@ -65,6 +36,38 @@ router = APIRouter(prefix="/api/v1/files", tags=["Files"]) +@router.get("/ocr/diag", summary="OCR 런타임 의존성 진단") +def ocr_dependency_diag(): + import shutil, subprocess + def which(cmd: str): + return shutil.which(cmd) is not None + def run(cmd: list[str]): + try: + out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=5) + return out.decode(errors="ignore").strip() + except Exception as e: + return f"ERR: {e}" + + tesseract_ok = 
which("tesseract") + poppler_ok = which("pdftoppm") or which("pdftocairo") + soffice_ok = which("soffice") or which("libreoffice") + hwp5txt_ok = which("hwp5txt") + + langs = None + tess_ver = None + if tesseract_ok: + tess_ver = run(["tesseract", "--version"]).splitlines()[0] if tesseract_ok else None + langs_out = run(["tesseract", "--list-langs"]) + langs = [l.strip() for l in langs_out.splitlines() if l and not l.lower().startswith("list of available")] if langs_out and not langs_out.startswith("ERR:") else None + + return { + "tesseract": tesseract_ok, + "tesseract_version": tess_ver, + "tesseract_langs": langs, + "poppler": poppler_ok, + "libreoffice": soffice_ok, + "hwp5txt": hwp5txt_ok, + } @router.post( "/upload", @@ -194,73 +197,94 @@ def download_file( @router.post( "/ocr", - summary="이미지 OCR → 텍스트 변환 후 노트 생성", - response_model=dict + summary="이미지/PDF/DOC/DOCX/HWP OCR → 텍스트 변환 후 노트 생성", + response_model=OCRResponse ) async def ocr_and_create_note( - ocr_file: UploadFile = File(...), + # 변경: 업로드 필드명 'file' 기본 + 과거 호환 'ocr_file' 동시 허용 + file: Optional[UploadFile] = File(None, description="기본 업로드 필드명"), + ocr_file: Optional[UploadFile] = File(None, description="과거 호환 업로드 필드명"), folder_id: Optional[int] = Form(None), + langs: str = Query("kor+eng", description="Tesseract 언어코드(예: kor+eng)"), + max_pages: int = Query(50, ge=1, le=500, description="최대 처리 페이지 수(기본 50)"), db: Session = Depends(get_db), current_user = Depends(get_current_user) ): """ - • EasyOCR + TrOCR 모델로 이미지에서 텍스트 추출 - • 가장 긴 결과를 선택해 새 노트로 저장 + 변경 전: 이미지 전용 EasyOCR/TrOCR로 텍스트 추출 후 노트 생성. + 변경 후(추가/변경): 공통 파이프라인(utils.ocr.run_pipeline)으로 이미지/PDF/DOC/DOCX/HWP 처리. + - 예외는 200으로 내려가며, results=[] + warnings에 사유 기입. + - 결과 텍스트를 합쳐 비어있지 않으면 기존과 동일하게 노트를 생성. 
""" - # 1) 이미지 로드 - contents = await ocr_file.read() - try: - image = Image.open(io.BytesIO(contents)).convert("RGB") - except Exception as e: - raise HTTPException(status_code=400, detail=f"이미지 처리 실패: {e}") + # 업로드 파일 결정 + upload = file or ocr_file + if upload is None: + raise HTTPException(status_code=400, detail="업로드 파일이 필요합니다. 필드명은 'file' 또는 'ocr_file'을 사용하세요.") + + filename = upload.filename or "uploaded" + mime = upload.content_type + + # 허용 확장자 확인 (불일치 시 200 + warnings) + _, ext = os.path.splitext(filename) + ext = ext.lower() + if ext and ext not in ALLOWED_ALL_EXTS: + return OCRResponse( + filename=filename, + mime=mime, + page_count=0, + results=[], + warnings=[f"허용되지 않는 확장자({ext}). 허용: {sorted(ALLOWED_ALL_EXTS)}"], + note_id=None, + text=None, + ) - # 2) EasyOCR - try: - image_np = np.array(image) - easy_results = reader.readtext(image_np) - easy_text = " ".join([res[1] for res in easy_results]) - except Exception: - easy_text = "" - - # 3) TrOCR 4개 모델 - hf_texts: List[str] = [] - try: - for pipe in ( - hf_trocr_printed, - hf_trocr_handwritten, - hf_trocr_small_printed, - hf_trocr_large_printed - ): - out = pipe(image) - if isinstance(out, list) and "generated_text" in out[0]: - hf_texts.append(out[0]["generated_text"].strip()) - except Exception: - pass - - # 4) 가장 긴 결과 선택 - candidates = [t for t in [easy_text] + hf_texts if t and t.strip()] - if not candidates: - raise HTTPException(status_code=500, detail="텍스트를 인식할 수 없습니다.") - ocr_text = max(candidates, key=len) - - # 5) Note 생성 - try: - new_note = NoteModel( - user_id=current_user.u_id, - folder_id=folder_id, - title="OCR 결과", - content=ocr_text + # 타입 판별 (보조적으로 unknown 방지) + ftype = detect_type(filename, mime) + if ftype == "unknown": + return OCRResponse( + filename=filename, + mime=mime, + page_count=0, + results=[], + warnings=["지원되지 않는 파일 형식입니다."], + note_id=None, + text=None, ) - db.add(new_note) - db.commit() - db.refresh(new_note) - except Exception as e: - raise 
HTTPException(status_code=500, detail=f"노트 저장 실패: {e}") - return { - "note_id": new_note.id, - "text": ocr_text - } + data = await upload.read() + + pipe = run_pipeline( + filename=filename, + mime=mime, + data=data, + langs=langs, + max_pages=max_pages, + ) + + merged_text = "\n\n".join([ + item.get("text", "") for item in (pipe.get("results") or []) if item.get("text") + ]).strip() + + note_id: Optional[int] = None + if merged_text: + try: + new_note = NoteModel( + user_id=current_user.u_id, + folder_id=folder_id, + title="OCR 결과", + content=merged_text, + ) + db.add(new_note) + db.commit() + db.refresh(new_note) + note_id = new_note.id + except Exception as e: + (pipe.setdefault("warnings", [])).append(f"노트 저장 실패: {e}") + + pipe["note_id"] = note_id + pipe["text"] = merged_text or None + + return pipe @router.post("/audio") @@ -338,3 +362,10 @@ async def upload_audio_and_transcribe( "message": "STT 및 노트 저장 완료", "transcript": transcript } +@router.options("/ocr") +def ocr_cors_preflight() -> Response: + """CORS preflight용 OPTIONS 응답. 일부 프록시/클라이언트에서 405 회피. 
+ 변경 전: 별도 OPTIONS 라우트 없음(미들웨어에 의존) + 변경 후(추가): 명시적으로 200을 반환 + """ + return Response(status_code=200) diff --git a/schemas/file.py b/schemas/file.py new file mode 100644 index 0000000..e329d89 --- /dev/null +++ b/schemas/file.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from typing import List, Optional +from pydantic import BaseModel, Field + + +class OCRResultItem(BaseModel): + page: int + text: str + + +class OCRResponse(BaseModel): + # 신규 필드(추가/변경): 공통 파이프라인 메타 + filename: str + mime: Optional[str] = None + page_count: int + results: List[OCRResultItem] = Field(default_factory=list) + warnings: List[str] = Field(default_factory=list) + + # 하위 호환(변경 전 응답 유지): 기존 이미지 OCR 응답 + note_id: Optional[int] = None + text: Optional[str] = None + diff --git a/utils/ocr/__init__.py b/utils/ocr/__init__.py new file mode 100644 index 0000000..5db369a --- /dev/null +++ b/utils/ocr/__init__.py @@ -0,0 +1,15 @@ +""" +utils.ocr 패키지 + +추가/변경 요약 +- 공통 OCR 파이프라인 진입점(run_pipeline)을 외부에 노출 +- 이미지/PDF/DOC/DOCX/HWP를 단일 인터페이스로 처리 +""" + +from .ocr_core import run_pipeline, detect_type + +__all__ = [ + "run_pipeline", + "detect_type", +] + diff --git a/utils/ocr/converters.py b/utils/ocr/converters.py new file mode 100644 index 0000000..7455d16 --- /dev/null +++ b/utils/ocr/converters.py @@ -0,0 +1,88 @@ +""" +utils/ocr/converters.py + +추가/변경 +- PDF/DOC/DOCX/HWP를 파이프라인에서 재사용할 수 있도록 변환/추출 유틸 제공 +- 외부 의존(soffice, hwp5txt)이 없을 수 있으므로 항상 예외를 던지지 말고 상위에서 warnings에 기록 +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +from typing import List + +from PIL import Image + + +def save_bytes_to_temp(data: bytes, suffix: str = "") -> str: + """바이트를 임시 파일로 저장하고 경로를 반환. + 호출자가 삭제를 책임짐. + """ + fd, path = tempfile.mkstemp(suffix=suffix) + with os.fdopen(fd, "wb") as f: + f.write(data) + return path + + +def pdf_to_images(pdf_path: str, dpi: int = 200) -> List[Image.Image]: + """pdf2image.convert_from_path로 PDF를 PIL 이미지 리스트로 변환. 
+ 주: 시스템에 poppler가 필요할 수 있음. + """ + from pdf2image import convert_from_path # 지연 임포트 + images = convert_from_path(pdf_path, dpi=dpi) + return images + + +def office_to_pdf(input_path: str, outdir: str) -> str: + """LibreOffice(soffice)를 사용하여 DOC/DOCX를 PDF로 변환. + 반환: 변환된 PDF 경로 + 실패 시 예외 발생(상위에서 warnings 처리) + """ + soffice = shutil.which("soffice") or shutil.which("libreoffice") + if not soffice: + raise RuntimeError("LibreOffice(soffice) 실행 파일을 찾을 수 없습니다.") + + cmd = [ + soffice, + "--headless", + "--convert-to", + "pdf", + "--outdir", + outdir, + input_path, + ] + proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if proc.returncode != 0: + raise RuntimeError( + f"LibreOffice 변환 실패: {proc.stderr.decode(errors='ignore')[:300]}" + ) + + base = os.path.splitext(os.path.basename(input_path))[0] + pdf_path = os.path.join(outdir, f"{base}.pdf") + if not os.path.exists(pdf_path): + # 일부 환경에서 출력 파일명이 다르게 생성될 수 있어 재탐색 + candidates = [p for p in os.listdir(outdir) if p.lower().endswith(".pdf")] + if candidates: + pdf_path = os.path.join(outdir, candidates[0]) + if not os.path.exists(pdf_path): + raise RuntimeError("PDF 결과 파일이 생성되지 않았습니다.") + return pdf_path + + +def hwp_to_text(input_path: str) -> str: + """hwp5txt(또는 pyhwp)로 HWP 텍스트를 추출. + 주: hwp5txt CLI가 설치되어 있어야 함. 없으면 예외. 
+ """ + hwp5txt = shutil.which("hwp5txt") + if not hwp5txt: + raise RuntimeError("hwp5txt 실행 파일을 찾을 수 없습니다.") + proc = subprocess.run([hwp5txt, input_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if proc.returncode != 0: + raise RuntimeError( + f"hwp5txt 추출 실패: {proc.stderr.decode(errors='ignore')[:300]}" + ) + return proc.stdout.decode(errors="ignore") + diff --git a/utils/ocr/ocr_core.py b/utils/ocr/ocr_core.py new file mode 100644 index 0000000..c15a315 --- /dev/null +++ b/utils/ocr/ocr_core.py @@ -0,0 +1,324 @@ +""" +utils/ocr/ocr_core.py + +추가/변경 +- 파일 타입 판별(확장자 우선, MIME 보조) 및 통합 OCR 파이프라인(run_pipeline) 구현 +- 이미지: pytesseract 기본 OCR, (기존) EasyOCR/TrOCR는 가능 시 보조로 시도하여 최적 텍스트 선택 +- PDF: pdf2image(convert_from_path, dpi=200)로 페이지 이미지를 생성하여 페이지별 OCR +- DOC/DOCX: LibreOffice(soffice --headless)로 PDF로 변환 후 PDF 파이프라인 재사용 +- HWP: hwp5txt로 텍스트 추출(성공 시 page=1로 results에 추가), 실패 시 warnings 기록 +- 대용량 제어: MAX_PAGES(기본 50)까지 처리하고 잘린 경우 warnings 기록 +- 예외는 raise하지 않고 results=[], warnings로 사유를 담아 상위가 200으로 응답할 수 있게 함 +""" + +from __future__ import annotations + +import io +import os +from typing import Dict, List, Optional, Tuple + +from PIL import Image + +from .converters import ( + save_bytes_to_temp, + pdf_to_images, + office_to_pdf, + hwp_to_text, +) + + +# 지원 확장자 세트 +IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"} +PDF_EXTS = {".pdf"} +DOC_EXTS = {".doc", ".docx"} +HWP_EXTS = {".hwp"} + + +def detect_type(filename: str, content_type: Optional[str]) -> str: + """확장자 기반 타입 판별, MIME은 보조. 
+ 반환: "image" | "pdf" | "docx" | "hwp" | "unknown" + """ + ext = os.path.splitext(filename or "")[1].lower() + if ext in IMAGE_EXTS: + return "image" + if ext in PDF_EXTS: + return "pdf" + if ext in DOC_EXTS: + return "docx" # 내부적으로 DOC/DOCX를 동일 경로로 처리 + if ext in HWP_EXTS: + return "hwp" + + # MIME 보조 판단(간단히) + if content_type: + ct = content_type.lower() + if ct.startswith("image/"): + return "image" + if ct == "application/pdf": + return "pdf" + if ct in ("application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"): + return "docx" + if "hwp" in ct: + return "hwp" + return "unknown" + + +def _ocr_image_pytesseract(img: Image.Image, langs: str, warnings: List[str]) -> str: + """pytesseract를 사용하여 이미지에서 텍스트 추출. + 주: 시스템에 tesseract OCR 엔진 및 언어 데이터가 설치되어 있어야 함. + """ + try: + import pytesseract + text = pytesseract.image_to_string(img, lang=langs) + return text.strip() + except Exception as e: + warnings.append(f"pytesseract OCR 실패: {e}") + return "" + + +def _ocr_image_legacy(img: Image.Image, warnings: List[str]) -> str: + """기존 이미지 OCR(EasyOCR + TrOCR) 로직 재사용. + - 환경/의존성에 따라 실패할 수 있으므로 예외는 warnings에만 기록. + - 기존 구현과 동일하게 가장 긴 텍스트를 선택. 
+ """ + try: + import numpy as np + import easyocr + from transformers import pipeline + except Exception as e: + warnings.append(f"기존 OCR 모듈(EasyOCR/TrOCR) 사용 불가: {e}") + return "" + + try: + # EasyOCR + reader = easyocr.Reader(["ko", "en"], gpu=False) + image_np = np.array(img.convert("RGB")) + easy_results = reader.readtext(image_np) + easy_text = " ".join([res[1] for res in easy_results]) + except Exception as e: + warnings.append(f"EasyOCR 실패: {e}") + easy_text = "" + + hf_texts: List[str] = [] + try: + for model_name in ( + "microsoft/trocr-base-printed", + "microsoft/trocr-base-handwritten", + "microsoft/trocr-small-printed", + "microsoft/trocr-large-printed", + ): + try: + pipe = pipeline("image-to-text", model=model_name, trust_remote_code=True) + out = pipe(img) + if isinstance(out, list) and out and isinstance(out[0], dict) and "generated_text" in out[0]: + hf_texts.append(out[0]["generated_text"].strip()) + except Exception as e: + warnings.append(f"TrOCR({model_name}) 실패: {e}") + except Exception as e: + warnings.append(f"TrOCR 파이프라인 초기화 실패: {e}") + + candidates = [t for t in [easy_text] + hf_texts if t and t.strip()] + if not candidates: + return "" + return max(candidates, key=len) + + +def _ocr_image_best(img: Image.Image, langs: str, warnings: List[str]) -> str: + """ + 변경(모델 우선): 기존(EasyOCR/TrOCR) → pytesseract 순으로 시도하고 더 긴 텍스트 선택. + - 서버에 Tesseract가 없어도 동작하도록 모델 기반 경로를 우선. 
+ """ + legacy_text = _ocr_image_legacy(img, warnings) + tesseract_text = _ocr_image_pytesseract(img, langs, warnings) + + candidates = [t for t in [legacy_text, tesseract_text] if t] + if not candidates: + return "" + return max(candidates, key=len) + + +def run_pipeline( + filename: str, + mime: Optional[str], + data: bytes, + langs: str = "kor+eng", + max_pages: int = 50, +) -> Dict: + """공통 OCR 파이프라인 + + 반환 JSON 스키마: + { + "filename": str, + "mime": str | null, + "page_count": int, + "results": [{"page": int, "text": str}], + "warnings": [str] + } + + 예외는 raise하지 않고 warnings에만 기록 후 results를 비워서 반환. + """ + warnings: List[str] = [] + results: List[Dict] = [] + page_count = 0 + + ftype = detect_type(filename, mime) + + try: + if ftype == "image": + # 단일 이미지 → 페이지 1로 간주 + try: + img = Image.open(io.BytesIO(data)).convert("RGB") + except Exception as e: + warnings.append(f"이미지 열기 실패: {e}") + img = None + + if img is not None: + text = _ocr_image_best(img, langs, warnings) + page_count = 1 + results.append({"page": 1, "text": text or ""}) + + elif ftype == "pdf": + # 변경: PyMuPDF(fitz) 우선 사용 → 네이티브 텍스트, 없으면 렌더링 후 모델 OCR + images: List[Image.Image] = [] + try: + import fitz # PyMuPDF + doc = fitz.open(stream=data, filetype="pdf") + total = doc.page_count + if total > max_pages: + warnings.append(f"페이지가 {max_pages}장을 초과하여 앞 {max_pages}페이지만 처리합니다.") + limit = min(total, max_pages) + for i in range(limit): + page = doc.load_page(i) + txt = (page.get_text("text") or "").strip() + if txt: + results.append({"page": i + 1, "text": txt}) + else: + # 이미지 렌더링 후 모델 OCR + try: + mat = fitz.Matrix(2, 2) # ~144 DPI 정도 + pix = page.get_pixmap(matrix=mat) + mode = "RGBA" if pix.alpha else "RGB" + img = Image.frombytes(mode, [pix.width, pix.height], pix.samples) + if mode == "RGBA": + img = img.convert("RGB") + images.append(img) + except Exception as e: + warnings.append(f"PDF 페이지 렌더링 실패(page {i+1}): {e}") + page_count = limit + except Exception as e: + warnings.append(f"PyMuPDF 처리 
실패: {e}") + # 대체 경로: pdf2image(poppler 필요) + pdf_path = save_bytes_to_temp(data, suffix=".pdf") + try: + images = pdf_to_images(pdf_path, dpi=200) + except Exception as ee: + warnings.append(f"PDF를 이미지로 변환 실패: {ee}") + images = [] + finally: + try: + os.remove(pdf_path) + except Exception: + pass + total = len(images) + if total > max_pages: + warnings.append(f"페이지가 {max_pages}장을 초과하여 앞 {max_pages}페이지만 처리합니다.") + images = images[:max_pages] + page_count = len(images) + + # 이미지에 대해 모델 OCR 수행 (필요한 페이지만) + for idx, img in enumerate(images, start=1): + text = _ocr_image_best(img, langs, warnings) + results.append({"page": idx, "text": text or ""}) + + elif ftype == "docx": + # 변경: .docx는 python-docx로 네이티브 텍스트 추출 우선, .doc는 LibreOffice 변환 + ext = os.path.splitext(filename or "")[1].lower() + if ext == ".docx": + try: + from docx import Document # python-docx + doc = Document(io.BytesIO(data)) + paras = [] + for p in doc.paragraphs: + if p.text: + paras.append(p.text) + text = "\n".join(paras).strip() + if text: + results.append({"page": 1, "text": text}) + page_count = 1 + else: + warnings.append("DOCX에서 추출된 텍스트가 없습니다.") + except Exception as e: + warnings.append(f"python-docx 처리 실패: {e}") + else: + # 구형 .doc → LibreOffice로 PDF 변환 후 OCR + in_path = save_bytes_to_temp(data, suffix=ext or ".doc") + outdir = os.path.dirname(in_path) + pdf_path: Optional[str] = None + try: + pdf_path = office_to_pdf(in_path, outdir) + # PDF 처리 동일 (PyMuPDF 경로 우선) + try: + import fitz + doc = fitz.open(pdf_path) + total = doc.page_count + if total > max_pages: + warnings.append(f"페이지가 {max_pages}장을 초과하여 앞 {max_pages}페이지만 처리합니다.") + limit = min(total, max_pages) + for i in range(limit): + page = doc.load_page(i) + txt = (page.get_text("text") or "").strip() + if txt: + results.append({"page": i + 1, "text": txt}) + else: + mat = fitz.Matrix(2, 2) + pix = page.get_pixmap(matrix=mat) + mode = "RGBA" if pix.alpha else "RGB" + img = Image.frombytes(mode, [pix.width, pix.height], pix.samples) + if 
mode == "RGBA": + img = img.convert("RGB") + t = _ocr_image_best(img, langs, warnings) + results.append({"page": i + 1, "text": t or ""}) + page_count = limit + except Exception as e: + warnings.append(f"DOC→PDF 처리 후 읽기 실패: {e}") + except Exception as e: + warnings.append(f"DOC 변환 실패: {e}") + finally: + try: + os.remove(in_path) + except Exception: + pass + if pdf_path: + try: + os.remove(pdf_path) + except Exception: + pass + + elif ftype == "hwp": + # HWP → hwp5txt 1차 시도. 성공 시 page=1 + in_path = save_bytes_to_temp(data, suffix=".hwp") + try: + text = hwp_to_text(in_path) + results.append({"page": 1, "text": (text or "").strip()}) + page_count = 1 + except Exception as e: + warnings.append(f"HWP 텍스트 추출 실패: {e}") + finally: + try: + os.remove(in_path) + except Exception: + pass + + else: + warnings.append("지원되지 않는 파일 형식입니다.") + + except Exception as e: + # 상위에서 200으로 내려줄 수 있도록 전체 예외 흡수 + warnings.append(f"파이프라인 실행 오류: {e}") + + return { + "filename": filename, + "mime": mime, + "page_count": page_count, + "results": results, + "warnings": warnings, + } From 8cc2056b665a513469714f698c623526bc85b42c Mon Sep 17 00:00:00 2001 From: BcKmini Date: Sun, 14 Sep 2025 03:49:19 +0900 Subject: [PATCH 2/2] 0914 03:49 --- .github/workflows/ci.yml | 41 +++++++++++++++++++++++++++ .github/workflows/docker.yml | 35 +++++++++++++++++++++++ Dockerfile | 20 +++++++++++++ README.md | 55 ++++++++++++++++++++++++++++++++++++ routers/file.py | 4 ++- 5 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/docker.yml create mode 100644 Dockerfile create mode 100644 README.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..bbc00c9 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,41 @@ +name: CI + +on: + pull_request: + branches: [ main ] + push: + branches: [ main ] + paths: + - '**/*.py' + - 'requirements.txt' + - '.github/workflows/ci.yml' + +jobs: + 
lint-test: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install deps + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Syntax check + run: | + python -m py_compile $(git ls-files '*.py' | tr '\n' ' ') + + - name: Import smoke + run: | + python - << 'PY' + from importlib import import_module + import_module('main') + print('Import OK') + PY + diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 0000000..1c8c292 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,35 @@ +name: Docker Build & Push (Backend) + +on: + push: + branches: [ main ] + paths: + - '**' + - '!README.md' + +jobs: + docker: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + push: true + tags: | + ghcr.io/${{ github.repository }}:backend-latest + ghcr.io/${{ github.repository }}:backend-${{ github.sha }} + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..99b6d62 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 + +WORKDIR /app + +# System dependencies for pdf2image +RUN apt-get update && apt-get install -y --no-install-recommends \ + poppler-utils \ + && rm -rf /var/lib/apt/lists/* + +COPY Backend/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +COPY Backend/ ./ + +EXPOSE 8080 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"] + diff --git a/README.md b/README.md new file mode 100644 index 0000000..c1a4caf --- /dev/null +++ b/README.md @@ -0,0 +1,55 @@ +# Noteflow Backend (FastAPI) + +## Overview +- FastAPI backend for Noteflow +- OCR pipeline supports images, PDF, DOC/DOCX, HWP (via utilities and system tools) + +## Run (local) +``` +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +uvicorn main:app --host 0.0.0.0 --port 8080 --reload +``` + +Env (optional): +- `SECRET_KEY`, `ACCESS_TOKEN_EXPIRE_MINUTES` +- Database URLs if you connect a DB (current code uses provided models) + +## OCR system tools (optional but recommended) +- PyMuPDF (Python) used by default for PDF text extraction +- Optional fallbacks/tools: + - Poppler (`pdftoppm`) for `pdf2image` + - LibreOffice (`soffice`) for .doc → .pdf + - `hwp5txt` for .hwp text extraction +- If missing, the API still returns 200 with `warnings` explaining limitations. 
+ +## API Highlights +- `POST /api/v1/files/ocr` — OCR and create note (accepts file + optional `folder_id`, `langs`, `max_pages`) +- `POST /api/v1/files/upload` — Upload files to folder +- `POST /api/v1/files/audio` — STT from audio, create/append to note + +## CI (GitHub Actions) +- This folder includes `.github/workflows/ci.yml` to lint/smoke-test on push/PR. +- Python 3.11, `pip install -r requirements.txt`, syntax check and import smoke. + +## Docker (optional; for later) +- Dockerfile included. Build & run locally: +``` +docker build -t noteflow-backend . +docker run --rm -p 8080:8080 noteflow-backend +``` +- GitHub Actions container build: + - `.github/workflows/docker.yml` pushes to GHCR: + - `ghcr.io/OWNER/REPO:backend-latest` + - `ghcr.io/OWNER/REPO:backend-SHA` (full commit SHA) +- Deployment example (SSH) once you’re ready: +``` +docker login ghcr.io -u USERNAME -p TOKEN +docker pull ghcr.io/OWNER/REPO:backend-latest +docker run -d --name backend --restart=always -p 8080:8080 ghcr.io/OWNER/REPO:backend-latest +``` + +## Notes +- If you split this folder into its own repository root, the included `.github/workflows/*.yml` files will work as-is. +- Image OCR runs both the model path (EasyOCR + TrOCR) and tesseract (when available) and keeps the longer of the two texts. diff --git a/routers/file.py b/routers/file.py index 539f8db..1a43a25 100644 --- a/routers/file.py +++ b/routers/file.py @@ -268,10 +268,12 @@ async def ocr_and_create_note( note_id: Optional[int] = None if merged_text: try: + # 추가/변경: 노트 제목을 업로드한 파일 이름으로 설정 (확장자 제거) + base_title = os.path.splitext(filename)[0].strip() or "OCR 결과" new_note = NoteModel( user_id=current_user.u_id, folder_id=folder_id, - title="OCR 결과", + title=base_title, content=merged_text, ) db.add(new_note)