# Copyright © 2025 Intellisol LLC. All Rights Reserved.
#
# This file is part of the Intellisol Automation System.
#
# This software is a trade secret of Intellisol LLC. It is proprietary and
# confidential information. You may not disclose this software or any part of it
# to any third party, or use it in any way not expressly authorized by the
# accompanying End-User License Agreement (EULA).
#
# UNPUBLISHED. RIGHTS RESERVED.


# data_extraction_system/tools/ocr_engine.py
import mimetypes
import os

from google.cloud import documentai_v1 as documentai

# MIME types forwarded to Document AI as-is; any other guess falls back to PDF
# so that callers who relied on the previous hard-coded value are unaffected.
_SUPPORTED_MIME_TYPES = frozenset(
    {
        "application/pdf",
        "image/bmp",
        "image/gif",
        "image/jpeg",
        "image/png",
        "image/tiff",
        "image/webp",
    }
)
_DEFAULT_MIME_TYPE = "application/pdf"


def _guess_mime_type(document_path: str) -> str:
    """Return the MIME type to declare for *document_path*, based on its extension."""
    guessed, _ = mimetypes.guess_type(document_path)
    if guessed in _SUPPORTED_MIME_TYPES:
        return guessed
    return _DEFAULT_MIME_TYPE


def perform_ocr(document_path: str) -> str:
    """Run OCR on a local document with Google Cloud Document AI.

    Configuration is read from the environment:

    * ``GOOGLE_CLOUD_PROJECT_ID`` (required)
    * ``DOCUMENT_AI_PROCESSOR_ID`` (required)
    * ``DOCUMENT_AI_LOCATION`` (optional, defaults to ``"us-central1"``)

    Args:
        document_path: Path of the file to process. Its MIME type is inferred
            from the file extension; unrecognised extensions are sent as PDF.

    Returns:
        The text extracted from the document, or an empty string if the
        Document AI request fails (the error is printed, not raised).

    Raises:
        ValueError: If the project ID or processor ID is not configured.
        OSError: If ``document_path`` cannot be opened or read.
    """
    project_id = os.getenv("GOOGLE_CLOUD_PROJECT_ID")
    location = os.getenv("DOCUMENT_AI_LOCATION", "us-central1")
    processor_id = os.getenv("DOCUMENT_AI_PROCESSOR_ID")

    if not all([project_id, processor_id]):
        raise ValueError("GOOGLE_CLOUD_PROJECT_ID and DOCUMENT_AI_PROCESSOR_ID must be set.")

    client = documentai.DocumentProcessorServiceClient()
    name = client.processor_path(project_id, location, processor_id)

    with open(document_path, "rb") as document_file:
        content = document_file.read()

    raw_document = documentai.RawDocument(
        content=content,
        mime_type=_guess_mime_type(document_path),
    )
    # ``process_document`` only accepts ``name`` as a flattened keyword; the raw
    # document has to travel inside a ProcessRequest. Passing ``raw_document=``
    # directly raises TypeError, which the handler below would silently turn
    # into an empty result.
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)

    try:
        result = client.process_document(request=request)
    except Exception as e:
        # Deliberate best-effort contract: report the failure and return no text.
        print(f"Error during OCR processing: {e}")
        return ""
    return result.document.text
