#!/usr/bin/env python3
import os
import sys
import json
import logging
from pathlib import Path
import whisper
from PIL import Image
import pytesseract
import PyPDF2
import requests
from typing import List, Dict, Optional
import time
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# OpenRouter API configuration.
# The key is read from the OPENROUTER_API_KEY environment variable so the
# secret never lives in source control. Any key that was previously committed
# to this file should be considered leaked and revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"

if not OPENROUTER_API_KEY:
    # Without a key the correction requests are rejected by the API; say so
    # once up front instead of only through per-request error logs.
    logging.warning("OPENROUTER_API_KEY não definida; a correção de texto via OpenRouter irá falhar.")

class ContentTranscriber:
    """Turn campaign media into corrected text files and compile them.

    Operates on a base directory that may contain ``videos``, ``imagens`` and
    ``pdfs`` sub-directories. Videos are transcribed with Whisper, images are
    read with Tesseract OCR and PDFs with PyPDF2. Each raw text is sent to a
    Gemini model through OpenRouter for correction and then written to the
    ``textos`` sub-directory, which is created on construction.
    """

    # PDFs whose extracted text is longer than this are corrected in chunks.
    PDF_CHUNK_THRESHOLD = 10000
    # Upper bound, in characters, of each chunk sent to the correction model.
    PDF_CHUNK_SIZE = 8000

    def __init__(self, base_dir):
        """Remember *base_dir* and make sure its ``textos`` folder exists."""
        self.base_dir = Path(base_dir)
        self.texts_dir = self.base_dir / 'textos'
        self.texts_dir.mkdir(exist_ok=True)
        # Loaded lazily by load_whisper_model() because loading is slow.
        self.whisper_model = None

    def load_whisper_model(self):
        """Load the Whisper "base" model on first use; later calls are no-ops."""
        if self.whisper_model is None:
            logging.info("Carregando modelo Whisper...")
            self.whisper_model = whisper.load_model("base")
            logging.info("Modelo Whisper carregado!")

    def correct_text_with_gemini(self, text: str, context: str = "") -> str:
        """Return *text* corrected by Gemini via OpenRouter (best effort).

        Args:
            text: Raw transcription/OCR output to correct.
            context: Short description of where the text came from; it is
                interpolated into the prompt.

        Returns:
            The corrected text, stripped. The original *text* is returned
            unchanged when it is empty or shorter than 10 non-blank
            characters, when the request fails, when the API answers with a
            non-200 status, or when the answer carries no usable content.
        """
        # Very short or empty inputs are not worth a network round-trip.
        if not text or len(text.strip()) < 10:
            return text

        headers = {
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json",
            "HTTP-Referer": "http://localhost:3000",
            "X-Title": "Campaign Organizer"
        }

        prompt = f"""Você é um especialista em correção de textos transcritos. 
        Corrija erros de transcrição, ortografia e gramática no texto abaixo, mantendo o sentido original.
        Contexto: {context}
        
        Texto para corrigir:
        {text}
        
        Retorne APENAS o texto corrigido, sem explicações ou comentários adicionais."""

        data = {
            "model": "google/gemini-2.5-flash",
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": 0.1
        }

        try:
            response = requests.post(OPENROUTER_URL, headers=headers, json=data, timeout=30)
            if response.status_code != 200:
                logging.error(f"Erro na API OpenRouter: {response.status_code} - {response.text}")
                return text
            corrected_text = response.json()['choices'][0]['message']['content']
        except Exception as e:
            # Deliberately broad: correction is optional, so any network or
            # parsing failure falls back to the uncorrected text.
            logging.error(f"Erro ao corrigir texto com Gemini: {e}")
            return text

        # A successful response with missing or blank content must not wipe
        # out the text we already have.
        if not isinstance(corrected_text, str) or not corrected_text.strip():
            logging.warning("Resposta vazia da API OpenRouter; mantendo texto original")
            return text

        return corrected_text.strip()

    def _save_text(self, output_name: str, header: str, text: str) -> Path:
        """Write *header*, a blank line and *text* to ``textos/<output_name>``."""
        output_path = self.texts_dir / output_name
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(f"{header}\n\n")
            f.write(text)
        return output_path

    @staticmethod
    def _split_into_chunks(text: str, max_chars: int) -> List[str]:
        """Split *text* into pieces of at most *max_chars* characters.

        Cuts are placed right after the last space or newline inside each
        window so words are not split in half; a hard cut is used only when
        the window holds no usable whitespace. Concatenating the returned
        pieces reproduces *text* exactly.

        Raises:
            ValueError: If *max_chars* is not positive.
        """
        if max_chars <= 0:
            raise ValueError("max_chars must be positive")

        chunks = []
        start = 0
        total = len(text)
        while start < total:
            end = min(start + max_chars, total)
            if end < total:
                window = text[start:end]
                cut = max(window.rfind('\n'), window.rfind(' '))
                # cut == 0 would produce a one-character chunk; hard-cut instead.
                if cut > 0:
                    end = start + cut + 1
            chunks.append(text[start:end])
            start = end
        return chunks

    @staticmethod
    def _list_files(directory: Path, suffix: Optional[str] = None) -> List[Path]:
        """Return the regular, non-hidden files of *directory*, sorted.

        Sub-directories and dot-files (e.g. ``.DS_Store``) are skipped. When
        *suffix* is given (lower-case, with the dot), only files whose
        extension matches it case-insensitively are kept.
        """
        files = []
        for entry in sorted(directory.iterdir()):
            if not entry.is_file() or entry.name.startswith('.'):
                continue
            if suffix is not None and entry.suffix.lower() != suffix:
                continue
            files.append(entry)
        return files

    def transcribe_video(self, video_path: Path) -> Optional[str]:
        """Transcribe a video with Whisper, correct it and save the result.

        The corrected transcription is written to
        ``textos/transc_video_<stem>.txt``.

        Returns:
            The corrected transcription, or ``None`` if any step failed
            (the error is logged).
        """
        try:
            self.load_whisper_model()
            logging.info(f"Transcrevendo vídeo: {video_path.name}")

            result = self.whisper_model.transcribe(
                str(video_path),
                language="pt",
                task="transcribe"
            )

            raw_text = result["text"]
            logging.info(f"Transcrição bruta concluída para: {video_path.name}")

            corrected_text = self.correct_text_with_gemini(
                raw_text,
                context="Transcrição de vídeo de campanha de marketing educacional"
            )

            output_path = self._save_text(
                f"transc_video_{video_path.stem}.txt",
                f"# Transcrição: {video_path.name}",
                corrected_text,
            )

            logging.info(f"Transcrição salva em: {output_path}")
            return corrected_text

        except Exception as e:
            logging.error(f"Erro ao transcrever vídeo {video_path}: {e}")
            return None

    def extract_text_from_image(self, image_path: Path) -> Optional[str]:
        """Run Portuguese OCR on an image, correct the text and save it.

        The corrected text is written to ``textos/transc_imagem_<stem>.txt``.

        Returns:
            The corrected text, or ``None`` when the OCR finds no text or
            any step failed (the error is logged).
        """
        try:
            logging.info(f"Extraindo texto da imagem: {image_path.name}")

            # The context manager closes the underlying file handle once the
            # OCR is done, even if Tesseract raises.
            with Image.open(image_path) as image:
                # Convert to RGB when the image uses any other mode.
                rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
                raw_text = pytesseract.image_to_string(rgb_image, lang='por')

            if not raw_text.strip():
                logging.warning(f"Nenhum texto encontrado em: {image_path.name}")
                return None

            logging.info(f"Texto extraído de: {image_path.name}")

            corrected_text = self.correct_text_with_gemini(
                raw_text,
                context="Texto extraído de imagem de campanha de marketing"
            )

            output_path = self._save_text(
                f"transc_imagem_{image_path.stem}.txt",
                f"# Texto da Imagem: {image_path.name}",
                corrected_text,
            )

            logging.info(f"Texto salvo em: {output_path}")
            return corrected_text

        except Exception as e:
            logging.error(f"Erro ao extrair texto da imagem {image_path}: {e}")
            return None

    def extract_text_from_pdf(self, pdf_path: Path) -> Optional[str]:
        """Extract the text of every PDF page, correct it and save it.

        Pages with text are prefixed with a ``--- Página N ---`` marker.
        Texts longer than ``PDF_CHUNK_THRESHOLD`` characters are corrected in
        chunks of at most ``PDF_CHUNK_SIZE`` characters, cut at whitespace.
        The result is written to ``textos/transc_pdf_<stem>.txt``.

        Returns:
            The corrected text, or ``None`` when no page yields text or any
            step failed (the error is logged).
        """
        pdf_context = "Texto extraído de PDF educacional de marketing"
        try:
            logging.info(f"Extraindo texto do PDF: {pdf_path.name}")

            text_parts = []
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                for page_number, page in enumerate(pdf_reader.pages, start=1):
                    # Guard against a None result so one text-less page
                    # cannot abort the whole document.
                    page_text = page.extract_text() or ""
                    if page_text.strip():
                        text_parts.append(f"--- Página {page_number} ---\n{page_text}")

            if not text_parts:
                logging.warning(f"Nenhum texto encontrado no PDF: {pdf_path.name}")
                return None

            raw_text = "\n\n".join(text_parts)
            logging.info(f"Texto extraído do PDF: {pdf_path.name}")

            if len(raw_text) > self.PDF_CHUNK_THRESHOLD:
                chunks = self._split_into_chunks(raw_text, self.PDF_CHUNK_SIZE)
                corrected_parts = []
                for i, chunk in enumerate(chunks, start=1):
                    if i > 1:
                        time.sleep(1)  # Rate limiting between API calls
                    logging.info(f"Corrigindo chunk {i}/{len(chunks)} do PDF...")
                    corrected_parts.append(
                        self.correct_text_with_gemini(chunk, context=pdf_context)
                    )
                corrected_text = "\n\n".join(corrected_parts)
            else:
                corrected_text = self.correct_text_with_gemini(raw_text, context=pdf_context)

            output_path = self._save_text(
                f"transc_pdf_{pdf_path.stem}.txt",
                f"# Texto do PDF: {pdf_path.name}",
                corrected_text,
            )

            logging.info(f"Texto salvo em: {output_path}")
            return corrected_text

        except Exception as e:
            logging.error(f"Erro ao extrair texto do PDF {pdf_path}: {e}")
            return None

    def create_markdown_compilation(self) -> Path:
        """Concatenate every ``textos/*.txt`` file into one markdown file.

        Files are added in sorted order, each followed by a ``---``
        separator. A file that cannot be read is logged and skipped.

        Returns:
            Path of the written ``copy_bank_completo.md``.
        """
        markdown_path = self.base_dir / 'copy_bank_completo.md'

        with open(markdown_path, 'w', encoding='utf-8') as md_file:
            md_file.write("# Copy Bank - Campanha Milla Borges\n\n")
            md_file.write(f"Compilado em: {datetime.now().strftime('%d/%m/%Y às %H:%M')}\n\n")
            md_file.write("---\n\n")

            for text_file in sorted(self.texts_dir.glob('*.txt')):
                try:
                    with open(text_file, 'r', encoding='utf-8') as f:
                        content = f.read()
                except (OSError, UnicodeDecodeError) as e:
                    logging.error(f"Erro ao ler {text_file}: {e}")
                    continue

                md_file.write(content)
                md_file.write("\n\n---\n\n")

            md_file.write("\n## Fim do Copy Bank\n")

        logging.info(f"Copy bank completo salvo em: {markdown_path}")
        return markdown_path

    def _process_files(self, files, handler, counters, delay) -> None:
        """Run *handler* on each path and tally the outcome in *counters*.

        A ``None`` result counts as an error, anything else as processed.
        Sleeps *delay* seconds between consecutive files as rate limiting.
        """
        for index, path in enumerate(files):
            if index:
                time.sleep(delay)  # Rate limiting between files
            # Compare with None explicitly: an empty but successfully saved
            # transcription is not a failure.
            if handler(path) is not None:
                counters['processed'] += 1
            else:
                counters['errors'] += 1

    def process_all_content(self) -> Dict[str, Dict[str, int]]:
        """Process every video, image and PDF under the base directory.

        Looks for the ``videos``, ``imagens`` and ``pdfs`` sub-directories
        (missing ones are skipped), ignoring nested directories and hidden
        files. Afterwards writes ``textos/processing_summary.json`` and the
        markdown compilation.

        Returns:
            A dict keyed by ``'videos'``, ``'images'`` and ``'pdfs'``, each
            holding ``'processed'`` and ``'errors'`` counters.
        """
        results = {
            'videos': {'processed': 0, 'errors': 0},
            'images': {'processed': 0, 'errors': 0},
            'pdfs': {'processed': 0, 'errors': 0}
        }

        videos_dir = self.base_dir / 'videos'
        if videos_dir.is_dir():
            video_files = self._list_files(videos_dir)
            logging.info(f"Encontrados {len(video_files)} vídeos para processar")
            self._process_files(video_files, self.transcribe_video, results['videos'], delay=2)

        images_dir = self.base_dir / 'imagens'
        if images_dir.is_dir():
            image_files = self._list_files(images_dir)
            logging.info(f"Encontradas {len(image_files)} imagens para processar")
            self._process_files(image_files, self.extract_text_from_image, results['images'], delay=1)

        pdfs_dir = self.base_dir / 'pdfs'
        if pdfs_dir.is_dir():
            # Case-insensitive match so files named *.PDF are not missed.
            pdf_files = self._list_files(pdfs_dir, suffix='.pdf')
            logging.info(f"Encontrados {len(pdf_files)} PDFs para processar")
            self._process_files(pdf_files, self.extract_text_from_pdf, results['pdfs'], delay=1)

        # Persist the counters next to the extracted texts.
        summary_path = self.texts_dir / 'processing_summary.json'
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        self.create_markdown_compilation()

        return results

def main():
    """Command-line entry point: process one content directory.

    Expects exactly one argument, the base directory. Exits with status 1
    when the argument count is wrong, when the path is not an existing
    directory, or when Tesseract is not installed. Otherwise runs
    ``ContentTranscriber.process_all_content`` and prints a summary of the
    per-category counters.
    """
    if len(sys.argv) != 2:
        print("Uso: python transcriber.py <diretório>")
        sys.exit(1)

    base_dir = sys.argv[1]

    # isdir rather than exists: a path to a regular file would pass an
    # existence check and only fail later, when the output folder is created.
    if not os.path.isdir(base_dir):
        print(f"Erro: Diretório '{base_dir}' não encontrado.")
        sys.exit(1)

    # Fail early, with install hints, if Tesseract is not installed.
    try:
        pytesseract.get_tesseract_version()
    except pytesseract.TesseractNotFoundError:
        print("ERRO: Tesseract não está instalado!")
        print("Por favor, instale o Tesseract:")
        print("  macOS: brew install tesseract tesseract-lang")
        print("  Ubuntu: sudo apt-get install tesseract-ocr tesseract-ocr-por")
        print("  Windows: Baixe de https://github.com/UB-Mannheim/tesseract/wiki")
        sys.exit(1)

    transcriber = ContentTranscriber(base_dir)

    print(f"Processando conteúdo em: {base_dir}")
    print("Isso pode demorar alguns minutos...")

    results = transcriber.process_all_content()

    print("\n=== RESUMO DO PROCESSAMENTO ===")
    for label, key in (("Vídeos", "videos"), ("Imagens", "images"), ("PDFs", "pdfs")):
        counters = results[key]
        print(f"{label}: {counters['processed']} processados, {counters['errors']} erros")
    print("===============================\n")

    print("Processamento concluído!")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()