#!/usr/bin/env python3
# /var/www/html/facial_api/scrape_baralho.py
"""
Scraper do 'Baralho do Crime' (SSP-BA)
- baixa fotos para /var/www/html/facial
- salva/atualiza registros na tabela facial.baralho_crime
Use com responsabilidade e atente-se a conformidade legal.
"""

import os
import time
import re
import logging
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import mysql.connector
from mysql.connector import errorcode

# ---------- CONFIGURATION ----------
# Page that lists the cards scraped by this script.
BASE_PAGE = "https://disquedenuncia.ssp.ba.gov.br/baralho-do-crime/"
# Directory where downloaded photos are written.
DOWNLOAD_DIR = "/var/www/html/facial"
# Keyword arguments passed verbatim to mysql.connector.connect().
# Every connection setting can be overridden through an environment variable
# so credentials do not have to live in the source tree; the literal values
# are kept only as backward-compatible defaults.
# NOTE(review): a root password committed to the repository should be rotated
# and the literal fallback removed once the environment variables are deployed.
DB_CONFIG = {
    "user": os.environ.get("BARALHO_DB_USER", "root"),
    "password": os.environ.get("BARALHO_DB_PASSWORD", "152535ff"),
    "host": os.environ.get("BARALHO_DB_HOST", "127.0.0.1"),
    "database": os.environ.get("BARALHO_DB_NAME", "facial"),
    "raise_on_warnings": True,
    "autocommit": True,
}
# Browser-like User-Agent sent with every HTTP request.
USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36")
REQUEST_TIMEOUT = 20  # seconds, passed as the requests timeout
SLEEP_BETWEEN = 1.2  # seconds to sleep between items (request throttle)

# ---------- LOG ----------
# Always log to the console; additionally log to a file when it can be opened.
# Opening the file is attempted separately so that a missing or read-only
# /var/log (e.g. when not running as root) does not abort the whole script
# before it has done any work.
_log_handlers = [logging.StreamHandler()]
try:
    _log_handlers.insert(0, logging.FileHandler("/var/log/scrape_baralho.log"))
    _log_file_error = None
except OSError as _exc:
    _log_file_error = _exc

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    handlers=_log_handlers)

if _log_file_error is not None:
    logging.warning("Não foi possível abrir /var/log/scrape_baralho.log (%s); "
                    "registrando apenas no console", _log_file_error)

# ---------- HELPERS ----------
def safe_filename(s, max_len=200):
    """Return *s* reduced to a conservative, length-limited file name.

    Every character other than ASCII letters, digits, ``_``, ``-`` and ``.``
    is replaced by ``_``. The result is limited to *max_len* characters.

    When truncation is needed the file extension is preserved (the stem is
    shortened instead), so an over-long image name does not lose its
    ``.jpg``/``.png`` suffix. Names that already fit are returned unchanged.

    Args:
        s: Raw name, e.g. a prefix joined with a URL basename.
        max_len: Maximum length of the returned name (default 200).

    Returns:
        The sanitized name, at most *max_len* characters long.
    """
    cleaned = re.sub(r'[^A-Za-z0-9_\-\.]', '_', s)
    if len(cleaned) <= max_len:
        return cleaned

    stem, ext = os.path.splitext(cleaned)
    # No extension, or an "extension" so long that nothing would remain of the
    # stem: fall back to plain truncation.
    if not ext or len(ext) >= max_len:
        return cleaned[:max_len]
    return stem[:max_len - len(ext)] + ext

def ensure_dir(path):
    """Create *path* (with any missing parents) if nothing exists there yet.

    A freshly created directory is chowned to the uid/gid of the running
    process. When *path* already exists the function does nothing.
    """
    if os.path.exists(path):
        return

    os.makedirs(path, exist_ok=True)
    # Optional: adjust ownership to whatever the deployment environment needs.
    owner_uid = os.getuid()
    owner_gid = os.getgid()
    os.chown(path, owner_uid, owner_gid)

# ---------- DB ----------
def get_db_conn():
    """Open and return a MySQL connection built from ``DB_CONFIG``.

    Connection errors are logged and then re-raised to the caller.
    """
    try:
        connection = mysql.connector.connect(**DB_CONFIG)
    except mysql.connector.Error as db_error:
        logging.error("Erro conexão DB: %s", db_error)
        raise
    else:
        return connection

def upsert_person(conn, data):
    """Insert one row into ``baralho_crime`` or update the existing one.

    Runs ``INSERT ... ON DUPLICATE KEY UPDATE``. The update path is presumably
    triggered by a unique key on ``foto_url`` (the table schema is not visible
    here — confirm).

    Args:
        conn: Open DB connection providing ``cursor()``.
        data: Mapping with the keys ``nome_completo``, ``apelido``, ``carta``,
            ``detalhe``, ``foto_path``, ``foto_url`` and ``source_url``.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor is closed in every case.
    """
    # NOTE(review): MySQL 8.0.20+ deprecates VALUES() inside ON DUPLICATE KEY
    # UPDATE. If the connection is opened with raise_on_warnings=True that
    # deprecation warning may surface as an error — confirm against the
    # server version before relying on this statement.
    sql = """
    INSERT INTO baralho_crime
      (nome_completo, apelido, carta, detalhe, foto_path, foto_url, source_url, atualizado_em)
    VALUES (%(nome_completo)s, %(apelido)s, %(carta)s, %(detalhe)s, %(foto_path)s, %(foto_url)s, %(source_url)s, NOW())
    ON DUPLICATE KEY UPDATE
      nome_completo=VALUES(nome_completo),
      apelido=VALUES(apelido),
      carta=VALUES(carta),
      detalhe=VALUES(detalhe),
      foto_path=VALUES(foto_path),
      source_url=VALUES(source_url),
      atualizado_em=NOW()
    """
    cur = conn.cursor()
    try:
        cur.execute(sql, data)
    finally:
        # Release the cursor even when execute() fails, so a run with many
        # failing rows does not accumulate open cursors on the connection.
        cur.close()

# ---------- SCRAPE ----------
def fetch(url):
    """GET *url* and return the response body as text.

    Any failure (network error, timeout, non-success HTTP status, ...) is
    logged and reported to the caller as ``None``.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        "Accept-Language": "pt-BR,pt;q=0.9",
    }
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=request_headers)
        response.raise_for_status()
        body = response.text
    except Exception as exc:
        logging.error("Erro fetch %s -> %s", url, exc)
        return None
    return body

def download_image(img_url, dest_dir, prefix=""):
    """Download *img_url* into *dest_dir* and return the local file path.

    The file name is ``safe_filename(prefix + "_" + <URL basename>)``. If a
    file with that name already exists and is larger than 1024 bytes it is
    reused without downloading again.

    The body is streamed into a temporary ``.part`` file that is renamed onto
    the final path only after the download completed, so an interrupted
    transfer can never leave a truncated image that a later run would mistake
    for a finished one.

    Args:
        img_url: Absolute URL of the image.
        dest_dir: Target directory; created when missing.
        prefix: Text prepended to the file name.

    Returns:
        The path of the saved (or already present) file, or ``None`` when
        anything in the download fails (the error is logged).
    """
    ensure_dir(dest_dir)
    headers = {"User-Agent": USER_AGENT}
    tmp_path = None
    try:
        # HEAD first: a non-200 answer is only logged, the GET is still tried.
        h = requests.head(img_url, headers=headers, allow_redirects=True, timeout=10)
        if h.status_code != 200:
            logging.warning("HEAD não OK %s -> %s", img_url, h.status_code)
        filename = safe_filename(prefix + "_" + os.path.basename(urlparse(img_url).path))
        path = os.path.join(dest_dir, filename)
        # Already downloaded: reuse it.
        if os.path.exists(path) and os.path.getsize(path) > 1024:
            logging.info("Arquivo já existe: %s", path)
            return path
        # Streamed GET; the context manager releases the connection even if
        # writing fails half-way.
        tmp_path = path + ".part"
        with requests.get(img_url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(1024 * 16):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, path)
        tmp_path = None  # renamed successfully, nothing left to clean up
        logging.info("Salvo imagem: %s", path)
        return path
    except Exception as e:
        logging.error("Falha ao baixar imagem %s -> %s", img_url, e)
        return None
    finally:
        # Remove a leftover partial file from a failed download.
        if tmp_path is not None and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass

def parse_and_extract(html, base_url):
    """Extract candidate image/text entries from the page HTML.

    Strategy: the site layout changes, so look for common patterns — any
    ``<article>``/``<div>`` that contains an ``<img>`` is treated as a card.
    If no such element exists, fall back to every ``<img>`` in the document
    and take the first non-empty text node following its parent as the name.

    Because the card selector also matches every *ancestor* div of an image,
    the same image would otherwise be reported once per nesting level. Entries
    are therefore de-duplicated by image source, keeping the innermost
    container (the last match in document order).

    Args:
        html: Page markup.
        base_url: Currently unused; kept for interface compatibility.

    Returns:
        A list of dicts with the keys ``img`` (raw src), ``alt``,
        ``name_raw`` (text associated with the image) and ``container``
        (first 200 characters of the enclosing markup). Images without a
        source are omitted.
    """
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.find_all(lambda tag: tag.name in ("article", "div") and tag.find("img"))

    if not cards:
        # Fallback: every image in the document.
        results = []
        for img in soup.select("img"):
            src = img.get("src") or img.get("data-src")
            if not src:
                continue
            alt = (img.get("alt") or "").strip()
            parent = img.parent
            # First non-empty text among the next few text nodes, if any.
            name_text = next(
                (sib.strip() for sib in parent.find_all_next(string=True, limit=6) if sib.strip()),
                "",
            )
            results.append({
                "img": src,
                "alt": alt,  # callers fall back to alt when name_raw is empty
                "name_raw": name_text,
                "container": str(parent)[:200],
            })
        return results

    # Keyed by src: later (deeper) containers overwrite earlier (outer) ones.
    by_src = {}
    for c in cards:
        img_tag = c.find("img")
        if not img_tag:
            continue
        src = img_tag.get("src") or img_tag.get("data-src") or ""
        if not src:
            continue
        alt = (img_tag.get("alt") or "").strip()
        # All text inside the card, joined into one raw string.
        name_raw = " | ".join(c.stripped_strings)[:800]
        by_src[src] = {"img": src, "alt": alt, "name_raw": name_raw, "container": str(c)[:200]}
    return list(by_src.values())

# Card written as "<rank> de <suit>", e.g. "Dez de Ouros".
_CARD_WITH_SUIT_RE = re.compile(
    r'[A-Za-zÀ-ú0-9 ]{1,50}de\s+(?:Ouros|Paus|Copas|Espadas)\b', re.I)
# Stand-alone rank word. Anchored on BOTH sides so that it cannot match the
# tail of an ordinary word (e.g. the "as" in "Lucas" or "Dias").
_CARD_RANK_RE = re.compile(r'\b(?:Rei|Rainha|Valete|Ás|As)\b', re.I)


def normalize_name_and_meta(raw):
    """Heuristically split raw card text into name, nickname, card and detail.

    Example input: ``"Fulano de Tal (Apelido) - Dez de Ouros"``.

    Heuristics:
      * ``carta``: a "<rank> de <suit>" phrase anywhere in the text is
        preferred; otherwise a stand-alone rank word (Rei, Rainha, Valete, Ás).
      * ``apelido``: the first text found between parentheses.
      * ``nome_completo``: the text is split on newlines, ``|`` and ``-``; the
        longest segment that has at least two words, contains no digit and is
        not the card itself is taken as the name.
      * ``detalhe``: the remaining segments joined by ``"; "`` (max. 800
        characters), or the whole raw text when no name was found.

    Args:
        raw: Raw text associated with an image; ``None`` is treated as ``""``.

    Returns:
        Dict with the keys ``nome_completo``, ``apelido``, ``carta`` and
        ``detalhe``; the first three are ``None`` when nothing was recognised.
    """
    s = raw or ""

    m_cart = _CARD_WITH_SUIT_RE.search(s) or _CARD_RANK_RE.search(s)
    carta = m_cart.group(0).strip() if m_cart else None

    m_par = re.search(r'\(([^)]+)\)', s)
    apelido = m_par.group(1).strip() if m_par else None

    lines = [ln.strip() for ln in re.split(r'[\n\r|-]+', s) if ln.strip()]
    # Filter first, then pick the longest: a long detail line containing
    # digits must not prevent a shorter, valid name line from being used.
    name_candidates = [
        ln for ln in lines
        if ln != carta and len(ln.split()) >= 2 and not re.search(r'\d', ln)
    ]
    nome_completo = max(name_candidates, key=len) if name_candidates else None

    if nome_completo:
        detalhe = "; ".join(ln for ln in lines if ln != nome_completo)[:800]
    else:
        detalhe = s

    return {"nome_completo": nome_completo, "apelido": apelido, "carta": carta, "detalhe": detalhe}

# ---------- MAIN ----------
def main():
    """Scrape the page, download every photo and upsert one DB row per image.

    Steps: fetch ``BASE_PAGE``, extract candidate entries, then for each entry
    resolve the image URL, derive name metadata, download the photo and
    insert/update the matching row. Failures on a single entry are logged and
    the run continues with the next one.

    The DB connection is closed even if the loop raises, and the throttle
    delay is applied after every entry that caused network traffic —
    including failed downloads, so a failing server is not hit without pause.
    """
    logging.info("Iniciando scrape do Baralho: %s", BASE_PAGE)
    html = fetch(BASE_PAGE)
    if not html:
        logging.error("Página não retornou HTML. Abortando.")
        return

    items = parse_and_extract(html, BASE_PAGE)
    logging.info("Encontrados ~%d itens candidate", len(items))
    ensure_dir(DOWNLOAD_DIR)

    conn = get_db_conn()
    try:
        for it in items:
            src = it.get("img")
            if not src:
                continue
            # Resolve relative URLs against the page URL.
            src_full = urljoin(BASE_PAGE, src)
            meta = normalize_name_and_meta(it.get("name_raw") or it.get("alt") or "")
            # File-name prefix derived from the best available label.
            pref = (meta.get("nome_completo") or meta.get("apelido") or "baralho").replace(" ", "_")[:60]
            foto_local = download_image(src_full, DOWNLOAD_DIR, prefix=pref)
            if foto_local:
                data = {
                    "nome_completo": meta.get("nome_completo"),
                    "apelido": meta.get("apelido"),
                    "carta": meta.get("carta"),
                    "detalhe": meta.get("detalhe"),
                    "foto_path": foto_local,
                    "foto_url": src_full,
                    "source_url": BASE_PAGE,
                }
                try:
                    upsert_person(conn, data)
                    logging.info("Inserido/atualizado: %s",
                                 data.get("nome_completo") or data.get("apelido") or src_full)
                except Exception as e:
                    logging.error("Erro inserção DB para %s -> %s", src_full, e)
            else:
                logging.warning("Não baixou imagem: %s", src_full)
            # Throttle after every entry that touched the network.
            time.sleep(SLEEP_BETWEEN)
    finally:
        conn.close()
    logging.info("Fim do processo.")

if __name__ == "__main__":
    main()