#!/usr/bin/env python3
"""
rdf_generation_skill.py (Multi-Provider Edition)

Generate comprehensive RDF (Turtle or JSON-LD) from URLs or local files using
native Python libraries and ANY LLM provider (OpenAI, Claude, Gemini, Grok, 
Mistral, Ollama, LM Studio, or custom endpoints), then upload to SPARQL endpoint 
(Virtuoso or any SPARQL 1.1 compliant endpoint) and/or save to local files.

SUPPORTED LLM PROVIDERS
-----------------------
✅ OpenAI              - gpt-4o-mini, gpt-4o, gpt-4-turbo (OPENAI_API_KEY)
✅ Anthropic Claude    - claude-3-5-sonnet, claude-3-opus (ANTHROPIC_API_KEY)
✅ Google Gemini       - gemini-2.0-flash, gemini-1.5-pro (GOOGLE_API_KEY)
✅ xAI Grok            - grok-2, grok-beta (XAI_API_KEY)
✅ Mistral             - mistral-large-latest, mistral-small (MISTRAL_API_KEY)
✅ Ollama (Local)      - llama2, mistral, neural-chat (no auth, free)
✅ LM Studio (Local)   - Any loaded model (no auth, free)
✅ Custom Endpoint     - Any OpenAI-compatible API (custom endpoint)

PREREQUISITES
-------------
1. Install dependencies:
   pip install requests beautifulsoup4 rdflib pypdf pdfplumber \\
               python-docx python-pptx pandas openpyxl

2. Set API key for your chosen provider:
   # OpenAI (default)
   export OPENAI_API_KEY="sk-..."
   
   # Anthropic Claude
   export ANTHROPIC_API_KEY="sk-ant-..."
   
   # Google Gemini
   export GOOGLE_API_KEY="AIzaSy..."
   
   # xAI Grok
   export XAI_API_KEY="xai-..."
   
   # Mistral
   export MISTRAL_API_KEY="..."
   
   # Local (Ollama/LM Studio - no key needed)

3. For SPARQL upload modes, ensure SPARQL endpoint is accessible.

BASIC USAGE
-----------
# Generate with OpenAI (default):
python rdf_generation_skill.py --source https://example.com

# Generate with Claude:
python rdf_generation_skill.py --source document.pdf --llm-provider claude

# Generate with local Ollama (FREE):
python rdf_generation_skill.py --source document.pdf --llm-provider ollama

# Generate with Gemini:
python rdf_generation_skill.py --source document.pdf --llm-provider gemini

# Generate with Grok (cost-effective):
python rdf_generation_skill.py --source document.pdf --llm-provider grok

# List all supported providers:
python rdf_generation_skill.py --list-providers

OUTPUT MODES
------------
Control where the generated RDF goes using --output-mode:

  sparql  - Upload to SPARQL endpoint (default)
  file    - Save to local file only (no upload)
  both    - Upload to SPARQL endpoint AND save to local file

VIRTUOSO CONFIGURATION
----------------------
For sparql/both output modes, configure the SPARQL endpoint:

python rdf_generation_skill.py --source https://example.com \\
    --sparql-endpoint http://localhost:8890/sparql \\
    --graph-iri urn:my:graph \\
    --user dba \\
    --password dba \\
    --verify

SUPPORTED INPUT FORMATS
-----------------------
URLs:        HTML web pages (text extracted via BeautifulSoup)
PDF:         .pdf (pypdf, pdfplumber fallback)
Word:        .docx, .doc (python-docx)
Excel:       .xlsx, .xls (pandas, openpyxl)
PowerPoint:  .pptx, .ppt (python-pptx)
CSV:         .csv (pandas)
Markdown:    .md, .markdown
HTML:        .html, .htm
Text:        .txt
"""

import os
import sys
import re
import json
import argparse
import requests
from pathlib import Path
from typing import Optional, Tuple, List
from urllib.parse import urlparse
from datetime import datetime

# LLM provider APIs are called directly over HTTP using requests
# (no provider SDKs such as the openai library, and no LlamaIndex, are needed)

# For RDF parsing and conversion
try:
    from rdflib import Graph
    HAS_RDFLIB = True
except ImportError:
    HAS_RDFLIB = False

# For better HTML text extraction from URLs
try:
    from bs4 import BeautifulSoup
    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False

# PDF readers
try:
    import pypdf
    HAS_PYPDF = True
except ImportError:
    HAS_PYPDF = False

try:
    import pdfplumber
    HAS_PDFPLUMBER = True
except ImportError:
    HAS_PDFPLUMBER = False

# Word document reader
try:
    from docx import Document as DocxDocument
    HAS_DOCX = True
except ImportError:
    HAS_DOCX = False

# PowerPoint reader
try:
    from pptx import Presentation
    HAS_PPTX = True
except ImportError:
    HAS_PPTX = False

# Excel reader
try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False

# ============================================================================
# LLM PROVIDER CONFIGURATION
# ============================================================================
# Supports multiple LLM providers with OpenAI-compatible or native APIs

# Registry of supported LLM providers, keyed by provider id.
#
# Fields read by the API helpers in this file:
#   endpoint          URL requests are POSTed to (None: caller must supply one)
#   api_key_env       environment variable holding the API key (None: no key)
#   name              human-readable provider name (shown in debug output)
#   api_type          wire format: "openai", "openai_compat" or "anthropic"
#   no_auth           optional; True skips key lookup and the auth header
#   max_tokens_param  request field carrying the completion-token limit
#                     (honoured by the "openai_compat" helper)
#   uses_query_param  optional; True sends the key as ?key=... instead of a
#                     Bearer header (no built-in provider sets it)
#   default_model     presumably the model used when none is specified -
#                     the consumer is outside this section, verify there
LLM_PROVIDERS = {
    "openai": {
        "endpoint": "https://api.openai.com/v1/chat/completions",
        "api_key_env": "OPENAI_API_KEY",
        "default_model": "gpt-4o-mini",
        "name": "OpenAI",
        "api_type": "openai",
        "max_tokens_param": "max_completion_tokens"  # OpenAI uses newer param name
    },
    "claude": {
        "endpoint": "https://api.anthropic.com/v1/messages",
        "api_key_env": "ANTHROPIC_API_KEY",
        "default_model": "claude-sonnet-4-20250514",
        "name": "Anthropic Claude",
        "api_type": "anthropic",
        "max_tokens_param": "max_tokens"
    },
    "gemini": {
        # Gemini's OpenAI-compatibility layer is served under /v1beta/openai/;
        # the path without the "openai" segment is not the chat-completions API.
        "endpoint": "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
        "api_key_env": "GOOGLE_API_KEY",
        "default_model": "gemini-3-flash-preview",
        "name": "Google Gemini",
        "api_type": "openai_compat",
        # Uses Bearer token auth
        "max_tokens_param": "max_tokens"
    },
    "grok": {
        "endpoint": "https://api.x.ai/v1/chat/completions",
        "api_key_env": "XAI_API_KEY",
        "default_model": "grok-2",
        "name": "xAI Grok",
        "api_type": "openai_compat",
        "max_tokens_param": "max_tokens"
    },
    "mistral": {
        "endpoint": "https://api.mistral.ai/v1/chat/completions",
        "api_key_env": "MISTRAL_API_KEY",
        "default_model": "mistral-large-latest",
        "name": "Mistral",
        "api_type": "openai_compat",
        "max_tokens_param": "max_tokens"
    },
    "ollama": {
        "endpoint": "http://localhost:11434/v1/chat/completions",
        "api_key_env": None,
        "default_model": "llama2",
        "name": "Ollama (local)",
        "api_type": "openai_compat",
        "no_auth": True,
        "max_tokens_param": "max_tokens"
    },
    "lmstudio": {
        "endpoint": "http://localhost:1234/v1/chat/completions",
        "api_key_env": None,
        "default_model": "local-model",
        "name": "LM Studio (local)",
        "api_type": "openai_compat",
        "no_auth": True,
        "max_tokens_param": "max_tokens"
    },
    "custom": {
        # No built-in endpoint: one must be supplied by the caller.
        "endpoint": None,
        "api_key_env": "LLM_API_KEY",
        "default_model": "model",
        "name": "Custom OpenAI-compatible endpoint",
        "api_type": "openai_compat",
        "max_tokens_param": "max_tokens"
    }
}

# --------------------------
# Default Configuration
# --------------------------
# Fallback values for the source document, target named graph, RDF
# serialization, output mode, SPARQL endpoint/credentials and LLM provider.
# (Presumably wired in as CLI argument defaults - the argument parser is
# outside this section, verify there.)
DEFAULT_SOURCE = "https://virtuoso.openlinksw.com"
DEFAULT_GRAPH_IRI = "urn:kidehen:graphrag:test"
DEFAULT_FORMAT = "turtle"
DEFAULT_OUTPUT_MODE = "sparql"
DEFAULT_SPARQL_ENDPOINT = "http://localhost:8890/sparql"
# NOTE(review): "dba"/"dba" look like stock local-install credentials -
# confirm they are never relied on against a shared/production endpoint.
DEFAULT_USER = "dba"
DEFAULT_PASSWORD = "dba"
DEFAULT_LLM_PROVIDER = "openai"

# Global debug flag; debug_print() only emits output while this is truthy.
DEBUG = False

# --------------------------
# File Extension Mapping
# --------------------------
# Maps a lower-cased file suffix (including the dot) to the loader key that
# load_source() dispatches on; get_file_type() returns 'unknown' for any
# suffix not listed here.
# NOTE(review): the legacy binary formats (.doc, .ppt, .xls) are routed to the
# same loaders as their OOXML successors; python-docx and python-pptx are
# documented for .docx/.pptx only, so these entries likely fail at load time -
# confirm before advertising them as supported.
EXTENSION_MAP = {
    '.pdf': 'pdf',
    '.docx': 'docx',
    '.doc': 'docx',
    '.xlsx': 'excel',
    '.xls': 'excel',
    '.pptx': 'pptx',
    '.ppt': 'pptx',
    '.csv': 'csv',
    '.md': 'markdown',
    '.markdown': 'markdown',
    '.html': 'html',
    '.htm': 'html',
    '.txt': 'text',
}

# --------------------------
# Output Format Extensions
# --------------------------
# Maps an RDF output format name to its conventional file extension.
# (Presumably used when naming locally saved output - the consumer is outside
# this section, verify there.)
FORMAT_EXTENSIONS = {
    'turtle': '.ttl',
    'jsonld': '.jsonld',
}

# --------------------------
# Default Prompt Templates
# --------------------------
# Format-independent modelling guidelines. load_prompt() substitutes this text
# for the {guidelines} placeholder of the default Turtle/JSON-LD prompts.
# The text itself carries a {source_uri} placeholder that is left for later
# substitution.
DEFAULT_PROMPT_GUIDELINES = """You MUST use <{source_uri}> for the base URI, which is then used in deriving relative hash-based hyperlinks that denote subjects and objects. This rule doesn't apply to entities that are already denoted by hyperlinks (e.g., DBpedia, Wikidata, Wikipedia, etc). Note the following guidelines:

1. Use appropriate vocabulary prefixes/context.
2. If applicable, include at least 10 Questions and associated Answers.
3. Utilize annotation properties to enhance the representations of Questions, Answers, Defined Term Set, HowTos, and HowToSteps, if they are included in the response and associate them with article sections (if they exist) or article using schema:hasPart.
4. Where relevant, add attributes for about, abstract, article body, and article section limited to a maximum of 30 words.
5. Denote values of about using hash-based IRIs derived from entity home page or wikipedia page url.
6. Where possible, if confident, add a DBpedia IRI to the list of about attribute values and then connect the list using owl:sameAs; note, never use schema:sameAs in this regard. In addition, never assign literal values to this attribute i.e., they MUST be IRIs.
7. Where relevant, add article sections and fleshed out body comprising no more than 20 words.
8. Where possible, align images with relevant article and howto step sections.
9. Add a label to each how-to step.
10. Add descriptions of any other relevant entity types.
11. Whenever you encounter inline double quotes within the value of an annotation attribute, change the inline double quotes to single quotes.
12. Whenever you encounter video, handle using the VideoObject type, specifying properties such as name, description, thumbnailUrl, uploadDate, contentUrl, and embedUrl -- don't guess and insert non-existent information.
13. Whenever you encounter audio, handle using the AudioObject type, specifying properties such as name, description, thumbnailUrl, uploadDate, contentUrl, and embedUrl -- don't guess and insert non-existent information.
14. Where relevant, include additional entity types, when discovered e.g., Product, Offer, and Service etc.
15. Language tag the values of annotation attributes, apply properly according to syntax rules.
16. Describe article authors and publishers in detail.
17. Use a relatedLink attribute to comprehensively handle all inline urls. Unless told otherwise, it should be a maximum of 20 relevant links.
18. You MUST ensure smart quotes are replaced with single quotes.
19. You MUST check and fix any syntax errors in your output."""

# Default prompt template for Turtle output. Placeholders: {guidelines}
# (filled in by load_prompt()), {source_uri} and {document_text}.
# NOTE(review): the template asks for "ONLY valid Turtle" yet also requests
# follow-up lists "for my approval"; confirm that mix is intended for however
# the response is consumed.
DEFAULT_TURTLE_PROMPT = """Generate a comprehensive representation of this information in valid RDF Turtle syntax using terms from <http://schema.org/>.

{guidelines}

TURTLE-SPECIFIC REQUIREMENTS:
- Start with @base <{source_uri}> .
- Include @prefix declarations for schema:, owl:, xsd:, rdfs:, and any other needed prefixes.
- Use proper Turtle syntax with correct punctuation (periods, semicolons, commas).
- Triple quote literal values containing more than 20 words using \"\"\".
- Output ONLY valid Turtle syntax (no prose, no markdown code blocks, no explanation).

DOCUMENT CONTENT:
\"\"\"
{document_text}
\"\"\"

Following your initial response, perform the following tasks:
1. Check and fix any syntax errors in the response.
2. Provide a list of additional questions, defined terms, or howtos for my approval.
3. Provide a list of additional entity types that could be described for my approval.
4. If the suggested additional entity types are approved, you MUST then return a revised final description comprising the original and added entity descriptions.

Generate the Turtle RDF now:"""

# Default prompt template for JSON-LD output. Placeholders: {guidelines}
# (filled in by load_prompt()), {source_uri} and {document_text}.
# Unlike the Turtle template, this one asks the model to answer in a code block.
DEFAULT_JSONLD_PROMPT = """Using a code block, generate a comprehensive representation of this information in JSON-LD using valid terms from <http://schema.org/>.

{guidelines}

JSON-LD SPECIFIC REQUIREMENTS:
- Use @base set to {source_uri} for deriving relative hash-based hyperlinks.
- Use @vocab appropriately in @context.
- Expand @context to include all needed prefixes (schema, owl, xsd, rdfs, etc.).
- Ensure proper @id designation for IRI values of attributes that only accept IRI values (e.g., schema:sameAs, owl:sameAs, etc.).
- Output ONLY valid JSON-LD syntax (no prose outside code blocks, no explanation before the JSON).

DOCUMENT CONTENT:
\"\"\"
{document_text}
\"\"\"

Following your initial response, perform the following tasks:
1. Check and fix any syntax errors in the response.
2. Provide a list of additional questions, defined terms, or howtos for my approval.
3. Provide a list of additional entity types that could be described for my approval.
4. If the suggested additional entity types are approved, you MUST then return a revised final description comprising the original and added entity descriptions.

Generate the JSON-LD now:"""


def debug_print(message: str):
    """Emit *message* with the debug prefix, but only while DEBUG is truthy."""
    if not DEBUG:
        return
    print(f"🔍 DEBUG: {message}")


def is_url(source: str) -> bool:
    """Return True when *source* parses as an ``http`` or ``https`` URL.

    Only the scheme is inspected; anything else (local paths, other schemes,
    unparsable input) yields False.
    """
    try:
        return urlparse(source).scheme in ('http', 'https')
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError for malformed input (e.g. a broken IPv6
        # literal) and TypeError/AttributeError for non-string arguments.
        # A bare ``except`` would also hide KeyboardInterrupt/SystemExit.
        return False


def get_file_type(file_path: str) -> str:
    """Map the path's (case-insensitive) suffix to a loader key.

    Returns 'unknown' when the suffix has no entry in EXTENSION_MAP.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix in EXTENSION_MAP:
        return EXTENSION_MAP[suffix]
    return 'unknown'


def fetch_url_content(url: str) -> str:
    """Download *url* and return its text, stripping markup from HTML pages.

    A response is treated as HTML when its Content-Type mentions "html" or
    its body starts with "<!" or "<html". Non-HTML bodies are returned as-is.
    HTTP error statuses raise via ``raise_for_status``.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    body = response.text
    declared_type = response.headers.get("Content-Type", "").lower()

    looks_like_html = "html" in declared_type or body.strip().startswith(("<!", "<html"))
    if not looks_like_html:
        return body

    if not HAS_BS4:
        # Fallback when BeautifulSoup is unavailable: drop tags, squeeze whitespace.
        without_tags = re.sub(r'<[^>]+>', ' ', body)
        return re.sub(r'\s+', ' ', without_tags).strip()

    soup = BeautifulSoup(body, "html.parser")
    # Script and style contents are not document text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)


def fetch_text_content(url: str) -> str:
    """Download *url* and return the response body unmodified (used for prompt files).

    HTTP error statuses raise via ``raise_for_status``.
    """
    reply = requests.get(url, timeout=30)
    reply.raise_for_status()
    return reply.text


def load_prompt(prompt_arg: Optional[str], prompt_file_arg: Optional[str], output_format: str) -> str:
    """
    Resolve the prompt template to use.

    Sources are tried in this order:
    1. the inline prompt (--prompt)
    2. a prompt file (--prompt-file), either a URL or a local path
    3. the built-in default for *output_format* ("jsonld", otherwise Turtle)

    A prompt file that cannot be loaded falls back to the default with a
    warning on stderr. Warnings are also printed when the chosen template
    lacks the {document_text} or {source_uri} placeholder.

    Returns the template text, placeholders still unsubstituted.
    """
    template = None

    if prompt_arg:
        print("   Using inline prompt")
        template = prompt_arg
    elif prompt_file_arg and is_url(prompt_file_arg):
        print(f"   Loading prompt from URL: {prompt_file_arg}")
        try:
            template = fetch_text_content(prompt_file_arg)
        except Exception as e:
            print(f"⚠️  Failed to load prompt from URL: {e}", file=sys.stderr)
            print("   Falling back to default prompt", file=sys.stderr)
    elif prompt_file_arg and not os.path.exists(prompt_file_arg):
        print(f"⚠️  Prompt file not found: {prompt_file_arg}", file=sys.stderr)
        print("   Falling back to default prompt", file=sys.stderr)
    elif prompt_file_arg:
        print(f"   Loading prompt from file: {prompt_file_arg}")
        try:
            with open(prompt_file_arg, 'r', encoding='utf-8') as f:
                template = f.read()
        except Exception as e:
            print(f"⚠️  Failed to load prompt from file: {e}", file=sys.stderr)
            print("   Falling back to default prompt", file=sys.stderr)

    # Nothing usable was supplied (or loading failed): use the built-in prompt.
    if template is None:
        print("   Using default prompt")
        base = DEFAULT_JSONLD_PROMPT if output_format == "jsonld" else DEFAULT_TURTLE_PROMPT
        template = base.replace("{guidelines}", DEFAULT_PROMPT_GUIDELINES)

    # Without {document_text} the document never reaches the model.
    if "{document_text}" in template:
        debug_print("Prompt contains {document_text} placeholder ✓")
    else:
        banner = "=" * 70
        print("\n" + banner, file=sys.stderr)
        print("⚠️  WARNING: Your prompt does NOT contain {document_text} placeholder!", file=sys.stderr)
        print("   The actual document content will NOT be included in the prompt.", file=sys.stderr)
        print("   This will likely result in hallucinated output.", file=sys.stderr)
        print(banner + "\n", file=sys.stderr)

    if "{source_uri}" in template:
        debug_print("Prompt contains {source_uri} placeholder ✓")
    else:
        print("⚠️  WARNING: Your prompt does not contain {source_uri} placeholder", file=sys.stderr)

    return template


def load_pdf_with_pypdf(file_path: str) -> str:
    """Extract text from a PDF with pypdf, one labelled section per page.

    Returns "" when pypdf is not installed or extraction fails (the failure
    is only reported through debug_print). Pages without text are skipped.
    """
    if not HAS_PYPDF:
        return ""

    debug_print(f"Trying pypdf for: {file_path}")
    try:
        reader = pypdf.PdfReader(file_path)
        sections = []
        for page_no, page in enumerate(reader.pages, start=1):
            extracted = page.extract_text()
            if not extracted:
                continue
            sections.append(f"--- Page {page_no} ---\n{extracted}")
            debug_print(f"  Page {page_no}: {len(extracted)} chars extracted")
        return "\n\n".join(sections)
    except Exception as e:
        debug_print(f"pypdf failed: {e}")
        return ""


def load_pdf_with_pdfplumber(file_path: str) -> str:
    """Extract page text and tables from a PDF with pdfplumber.

    Each page's text and each non-empty table becomes its own labelled
    section; table rows are tab-separated. Returns "" when pdfplumber is not
    installed or extraction fails (the failure is only reported through
    debug_print).
    """
    if not HAS_PDFPLUMBER:
        return ""

    debug_print(f"Trying pdfplumber for: {file_path}")
    try:
        sections = []
        with pdfplumber.open(file_path) as pdf:
            for page_no, page in enumerate(pdf.pages, start=1):
                extracted = page.extract_text()
                if extracted:
                    sections.append(f"--- Page {page_no} ---\n{extracted}")
                    debug_print(f"  Page {page_no}: {len(extracted)} chars extracted")

                # Tables are extracted in addition to the page text.
                for table_no, table in enumerate(page.extract_tables(), start=1):
                    if not table:
                        continue
                    rendered = "\n".join(
                        "\t".join(str(cell or "") for cell in row) for row in table
                    )
                    sections.append(f"--- Table {table_no} on Page {page_no} ---\n{rendered}")

        return "\n\n".join(sections)
    except Exception as e:
        debug_print(f"pdfplumber failed: {e}")
        return ""


def load_pdf(file_path: str) -> str:
    """
    Extract text from a PDF, trying the available backends in order.

    pdfplumber is attempted first; pypdf is used only when pdfplumber is
    missing or produced nothing but whitespace. The backend that succeeded is
    printed; if none yields text a warning goes to stderr and the (empty)
    result is returned.
    """
    content = ""
    method_used = None

    backends = (
        ("pdfplumber", HAS_PDFPLUMBER, load_pdf_with_pdfplumber),
        ("pypdf", HAS_PYPDF, load_pdf_with_pypdf),
    )
    for label, available, extract in backends:
        if content.strip():
            # An earlier backend already produced usable text.
            break
        if not available:
            continue
        content = extract(file_path)
        if content.strip():
            method_used = label

    if not content.strip():
        print("⚠️  WARNING: No text could be extracted from PDF!", file=sys.stderr)
        print("   The PDF may be scanned/image-based or encrypted.", file=sys.stderr)
    else:
        print(f"   PDF extracted using: {method_used}")
        print(f"   Extracted text length: {len(content)} characters")

    return content


def load_docx(file_path: str) -> str:
    """Extract paragraph and table text from a Word document via python-docx.

    Non-blank paragraphs come first, followed by one " | "-joined line per
    non-blank table row; entries are separated by blank lines.

    Raises:
        ImportError: if python-docx is not installed.
    """
    if not HAS_DOCX:
        raise ImportError("python-docx not installed. Run: pip install python-docx")

    document = DocxDocument(file_path)
    chunks = [para.text for para in document.paragraphs if para.text.strip()]

    # Table rows are appended after all paragraphs.
    for table in document.tables:
        for row in table.rows:
            line = " | ".join(cell.text for cell in row.cells)
            if line.strip():
                chunks.append(line)

    return "\n\n".join(chunks)


def load_pptx(file_path: str) -> str:
    """Extract slide text from a PowerPoint file via python-pptx.

    Every slide contributes a "--- Slide N ---" header, then the text of each
    shape that has any, then one " | "-joined line per non-blank table row.

    Raises:
        ImportError: if python-pptx is not installed.
    """
    if not HAS_PPTX:
        raise ImportError("python-pptx not installed. Run: pip install python-pptx")

    deck = Presentation(file_path)
    chunks = []

    for number, slide in enumerate(deck.slides, start=1):
        chunks.append(f"--- Slide {number} ---")

        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                chunks.append(shape.text)

            if not shape.has_table:
                continue
            # Flatten each table row into a single delimited line.
            row_lines = (
                " | ".join(cell.text for cell in row.cells)
                for row in shape.table.rows
            )
            chunks.extend(line for line in row_lines if line.strip())

    return "\n\n".join(chunks)


def load_excel(file_path: str) -> str:
    """Render every sheet of an Excel workbook as text using pandas.

    Each sheet contributes a "--- Sheet: <name> ---" header followed by the
    sheet's ``DataFrame.to_string()`` rendering; entries are separated by
    blank lines.

    Raises:
        ImportError: if pandas is not installed.
    """
    if not HAS_PANDAS:
        raise ImportError("pandas not installed. Run: pip install pandas openpyxl")

    text_parts = []

    # Open the workbook once and close it deterministically. Parsing each
    # sheet from the already-open ExcelFile avoids re-reading the file from
    # disk per sheet and the file handle left open by a bare pd.ExcelFile().
    with pd.ExcelFile(file_path) as workbook:
        for sheet_name in workbook.sheet_names:
            df = pd.read_excel(workbook, sheet_name=sheet_name)
            text_parts.append(f"--- Sheet: {sheet_name} ---")
            text_parts.append(df.to_string())

    return "\n\n".join(text_parts)


def load_csv(file_path: str) -> str:
    """Read a CSV file with pandas and return its ``DataFrame.to_string()`` rendering.

    Raises:
        ImportError: if pandas is not installed.
    """
    if not HAS_PANDAS:
        raise ImportError("pandas not installed. Run: pip install pandas")

    return pd.read_csv(file_path).to_string()


def load_markdown(file_path: str) -> str:
    """Return the raw contents of a Markdown file, decoded as UTF-8."""
    return Path(file_path).read_text(encoding='utf-8')


def load_html(file_path: str) -> str:
    """Read a UTF-8 HTML file and return its visible text.

    With BeautifulSoup available, <script>/<style> elements are removed and
    text nodes are joined with newlines; otherwise tags are stripped with a
    regex and whitespace is collapsed to single spaces.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    if not HAS_BS4:
        # Crude fallback: drop anything that looks like a tag.
        without_tags = re.sub(r'<[^>]+>', ' ', markup)
        return re.sub(r'\s+', ' ', without_tags).strip()

    soup = BeautifulSoup(markup, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)


def load_text(file_path: str) -> str:
    """Return the contents of a plain-text file, decoded as UTF-8."""
    return Path(file_path).read_text(encoding='utf-8')


def load_source(source: str) -> Tuple[str, str]:
    """
    Load document text from a URL or a local file.

    URLs are fetched with fetch_url_content and keep the URL as their source
    URI. Local files are dispatched on their extension-derived type, and their
    source URI is the ``file://`` URI of the resolved path.

    Returns:
        (content_text, source_uri)

    Raises:
        FileNotFoundError: if a non-URL source does not exist on disk.
        ValueError: if the file type has no loader.
    """
    if is_url(source):
        print("   Source type: URL")
        return fetch_url_content(source), source

    # Anything that is not an http(s) URL is treated as a local path.
    if not os.path.exists(source):
        raise FileNotFoundError(f"File not found: {source}")

    file_type = get_file_type(source)
    print(f"   Source type: {file_type.upper()} file")

    dispatch = {
        'pdf': load_pdf,
        'docx': load_docx,
        'pptx': load_pptx,
        'excel': load_excel,
        'csv': load_csv,
        'markdown': load_markdown,
        'html': load_html,
        'text': load_text,
    }
    loader = dispatch.get(file_type)
    if loader is None:
        raise ValueError(f"Unsupported file type: {file_type}")

    content = loader(source)
    return content, Path(source).resolve().as_uri()


# ============================================================================
# MULTI-PROVIDER LLM API FUNCTIONS
# ============================================================================

def call_llm_api(
    prompt: str,
    model: str,
    provider: str = "openai",
    api_endpoint: str = None,
    api_key: str = None,
    max_completion_tokens: int = 16384,
    temperature: float = 0,
    debug: bool = False
) -> str:
    """
    Send *prompt* to the chosen LLM provider and return the generated text.

    Args:
        prompt: Prompt text, sent as a single user message
        model: Model name/ID
        provider: Key of LLM_PROVIDERS (openai, claude, gemini, grok, mistral,
            ollama, lmstudio, custom)
        api_endpoint: Endpoint URL overriding the provider's configured one
        api_key: API key; when empty, the provider's environment variable is used
        max_completion_tokens: Upper bound on tokens in the response
        temperature: Sampling temperature (not forwarded to the Anthropic helper)
        debug: Print request diagnostics

    Returns:
        Generated text response

    Raises:
        ValueError: Unknown provider or API type, no endpoint available,
            missing API key, or a failure reported by the provider helper
            (the helpers wrap request errors in ValueError)
    """
    if provider not in LLM_PROVIDERS:
        raise ValueError(f"Unknown provider: {provider}. Supported: {list(LLM_PROVIDERS.keys())}")
    config = LLM_PROVIDERS[provider]

    # An explicit endpoint wins over the provider's configured one.
    endpoint = api_endpoint or config.get("endpoint")
    if not endpoint:
        raise ValueError(f"No endpoint configured for {provider}. Use --api-endpoint")

    # Providers flagged no_auth, or without a key variable, need no key.
    key_env = None if config.get("no_auth") else config.get("api_key_env")
    if key_env:
        api_key = api_key or os.getenv(key_env)
        if not api_key:
            raise ValueError(f"API key required. Set {key_env} environment variable or use --api-key")

    if debug:
        for line in (
            f"🔍 DEBUG: Provider: {provider}",
            f"🔍 DEBUG: Calling {config['name']} API",
            f"🔍 DEBUG: Endpoint: {endpoint}",
            f"🔍 DEBUG: Model: {model}",
        ):
            print(line)

    # Pick the helper matching the provider's wire format.
    api_type = config.get("api_type", "openai")
    if api_type == "anthropic":
        return _call_anthropic_api(endpoint, model, prompt, api_key, max_completion_tokens, debug)
    if api_type in ("openai", "openai_compat"):
        helper = _call_openai_api if api_type == "openai" else _call_openai_compatible
        return helper(endpoint, model, prompt, api_key, max_completion_tokens, temperature, config, debug)
    raise ValueError(f"Unknown API type: {api_type}")


def _call_openai_api(endpoint, model, prompt, api_key, max_completion_tokens, temperature, config, debug):
    """Call the native OpenAI chat-completions API and return the reply text.

    Args:
        endpoint: URL to POST to
        model: Model name/ID
        prompt: Prompt text, sent as a single user message
        api_key: Bearer token for the Authorization header
        max_completion_tokens: Sent as ``max_completion_tokens``
        temperature: Sampling temperature
        config: Provider configuration dict (unused here; kept so the
            signature matches _call_openai_compatible)
        debug: Print request/response diagnostics

    Returns:
        The first choice's message content, stripped of surrounding whitespace.

    Raises:
        ValueError: if the request fails (HTTP error bodies are included in the
            message) or the response carries no text content.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # OpenAI supports the newer max_completion_tokens parameter
    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        "max_completion_tokens": max_completion_tokens
    }

    if debug:
        print(f"🔍 DEBUG: POST to {endpoint}")

    try:
        response = requests.post(endpoint, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.HTTPError as e:
        # Surface the provider's error body, as the sibling helpers do;
        # it usually explains the failure better than the bare status line.
        try:
            error_detail = e.response.json()
        except ValueError:
            error_detail = e.response.text
        raise ValueError(f"OpenAI API call failed: {e}\nDetails: {error_detail}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"OpenAI API call failed: {e}") from e

    if debug:
        print(f"🔍 DEBUG: Response status: {response.status_code}")

    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        raise ValueError(f"Unexpected response format: {result}") from None
    # content can be null (e.g. a refusal); report that as a format error
    # instead of crashing on None.strip().
    if not isinstance(content, str):
        raise ValueError(f"Unexpected response format: {result}")

    content = content.strip()
    if debug:
        print(f"🔍 DEBUG: Response length: {len(content)} chars")
    return content


def _call_openai_compatible(endpoint, model, prompt, api_key, max_completion_tokens, temperature, config, debug):
    """Call an OpenAI-compatible chat-completions API and return the reply text.

    Used for Grok, Mistral, Gemini, Ollama, LM Studio and custom endpoints.

    Args:
        endpoint: URL to POST to
        model: Model name/ID
        prompt: Prompt text, sent as a single user message
        api_key: API key; sent as a Bearer token, or as a ``key`` query
            parameter when the config sets ``uses_query_param``
        max_completion_tokens: Token limit, sent under the field named by the
            config's ``max_tokens_param`` (default ``max_tokens``)
        temperature: Sampling temperature
        config: Provider configuration dict
        debug: Print request/response diagnostics

    Returns:
        The first choice's message content, stripped of surrounding whitespace.

    Raises:
        ValueError: if the request fails (HTTP error bodies are included in the
            message) or the response carries no text content.
    """
    headers = {
        "Content-Type": "application/json"
    }

    # Only add auth if required
    if not config.get("no_auth") and api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # Get the correct max_tokens parameter name for this provider
    max_tokens_param = config.get("max_tokens_param", "max_tokens")

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": temperature,
        max_tokens_param: max_completion_tokens
    }

    # Providers that take the key as a query parameter: let requests build the
    # query string (handles URL-encoding and endpoints that already have one)
    # and drop the Authorization header.
    params = None
    if config.get("uses_query_param") and api_key:
        params = {"key": api_key}
        headers.pop("Authorization", None)

    if debug:
        # Print the bare endpoint only, so the API key never reaches the logs.
        print(f"🔍 DEBUG: POST to {endpoint}")
        print(f"🔍 DEBUG: max_tokens param: {max_tokens_param}")

    try:
        response = requests.post(endpoint, headers=headers, params=params, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.HTTPError as e:
        try:
            error_detail = e.response.json()
        except ValueError:
            # Error body is not JSON; fall back to the raw text.
            error_detail = e.response.text
        raise ValueError(f"API call failed: {e}\nDetails: {error_detail}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"API call failed: {e}") from e

    if debug:
        print(f"🔍 DEBUG: Response status: {response.status_code}")

    # Standard OpenAI-compatible response format
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        raise ValueError(f"Unexpected response format: {result}") from None
    # content can be null; report that as a format error instead of
    # crashing on None.strip().
    if not isinstance(content, str):
        raise ValueError(f"Unexpected response format: {result}")

    content = content.strip()
    if debug:
        print(f"🔍 DEBUG: Response length: {len(content)} chars")
    return content


def _call_anthropic_api(endpoint, model, prompt, api_key, max_completion_tokens, debug):
    """Call the Anthropic Messages API (native format) and return the reply text.

    Args:
        endpoint: URL to POST to
        model: Model name/ID
        prompt: Prompt text, sent as a single user message
        api_key: Key sent in the ``x-api-key`` header
        max_completion_tokens: Sent as ``max_tokens``
        debug: Print request/response diagnostics

    Returns:
        The concatenated text of all text content blocks, stripped of
        surrounding whitespace.

    Raises:
        ValueError: if the request fails (HTTP error bodies are included in the
            message) or the response contains no text block.
    """
    # Anthropic uses x-api-key header, NOT Authorization: Bearer
    headers = {
        "x-api-key": api_key,
        "Content-Type": "application/json",
        "anthropic-version": "2023-06-01"
    }

    # Anthropic uses different message format
    payload = {
        "model": model,
        "max_tokens": max_completion_tokens,
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    if debug:
        print(f"🔍 DEBUG: POST to {endpoint}")
        print(f"🔍 DEBUG: Using x-api-key header for Anthropic auth")

    try:
        response = requests.post(endpoint, headers=headers, json=payload, timeout=120)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.HTTPError as e:
        try:
            error_detail = e.response.json()
        except ValueError:
            # Error body is not JSON; fall back to the raw text.
            error_detail = e.response.text
        raise ValueError(f"Anthropic API call failed: {e}\nDetails: {error_detail}") from e
    except requests.exceptions.RequestException as e:
        raise ValueError(f"Anthropic API call failed: {e}") from e

    if debug:
        print(f"🔍 DEBUG: Response status: {response.status_code}")

    # "content" is a list of typed blocks; the first one is not guaranteed to
    # be text, so gather every text block rather than indexing [0]["text"].
    blocks = result.get("content") if isinstance(result, dict) else None
    texts = [
        block["text"]
        for block in (blocks if isinstance(blocks, list) else [])
        if isinstance(block, dict)
        and block.get("type", "text") == "text"
        and isinstance(block.get("text"), str)
    ]
    if not texts:
        raise ValueError(f"Unexpected response format: {result}")

    content = "".join(texts).strip()
    if debug:
        print(f"🔍 DEBUG: Response length: {len(content)} chars")
    return content


def generate_rdf(
    document_text: str,
    llm_api_key: str,
    llm_model: str,
    source_uri: str,
    prompt_template: str,
    llm_provider: str = "openai",
    api_endpoint: Optional[str] = None,
    max_content_chars: int = 12000,
    max_completion_tokens: int = 16384
) -> str:
    """
    Ask LLM to generate comprehensive RDF using the provided prompt template.

    The document is truncated to ``max_content_chars`` (a truncation marker is
    appended), substituted into the template together with the source URI, and
    the resulting prompt is sent through ``call_llm_api`` with temperature 0.

    Args:
        document_text: The document content to process
        llm_api_key: API key for the LLM provider
        llm_model: Model name to use
        source_uri: Source document URI
        prompt_template: Prompt template with {document_text} and {source_uri} placeholders
        llm_provider: Which LLM provider to use
        api_endpoint: Override endpoint (for custom providers)
        max_content_chars: Max chars to include from document
        max_completion_tokens: Max completion tokens

    Returns:
        LLM response text (returned as received, even when empty)

    Raises:
        ValueError: Re-raised from the API call after being reported on stderr.
        requests.exceptions.RequestException: Re-raised likewise.
    """
    # Tolerate a missing document the same way as an empty one.
    document_text = document_text or ""

    # Resolve the display name once, without indexing: an unknown provider id
    # must not raise KeyError inside the error handler below and hide the
    # real API error.
    provider_name = LLM_PROVIDERS.get(llm_provider, {}).get('name', llm_provider)

    # Check if document text is empty
    if not document_text.strip():
        print("\n" + "=" * 70, file=sys.stderr)
        print("⚠️  CRITICAL: Document text is EMPTY!", file=sys.stderr)
        print("   The LLM will have no content to work with.", file=sys.stderr)
        print("   Output will be hallucinated.", file=sys.stderr)
        print("=" * 70 + "\n", file=sys.stderr)

    # Truncate document if too long
    original_length = len(document_text)
    if original_length > max_content_chars:
        document_text = document_text[:max_content_chars] + "\n[... content truncated ...]"
        debug_print(f"Document truncated from {original_length} to {max_content_chars} chars")

    # Show preview of document content in debug mode
    debug_print(f"Document content preview (first 500 chars):\n{'-' * 40}")
    if DEBUG:
        print(document_text[:500])
        print(f"{'-' * 40}")

    # Check if placeholder exists before replacement
    has_doc_placeholder = "{document_text}" in prompt_template
    has_uri_placeholder = "{source_uri}" in prompt_template

    debug_print(f"Placeholder check - {{document_text}}: {has_doc_placeholder}, {{source_uri}}: {has_uri_placeholder}")

    # Replace placeholders in prompt. The URI goes first so that placeholder
    # text occurring inside the document itself is never substituted.
    prompt = prompt_template.replace("{source_uri}", source_uri)
    prompt = prompt.replace("{document_text}", document_text)

    # Verify replacement happened
    if has_doc_placeholder and document_text[:100] not in prompt:
        print("⚠️  WARNING: Document text may not have been inserted into prompt!", file=sys.stderr)

    debug_print(f"Final prompt length: {len(prompt)} chars")

    # Show (the first 2000 chars of) the prompt in debug mode
    if DEBUG:
        print(f"\n{'=' * 40} FULL PROMPT {'=' * 40}")
        print(prompt[:2000])
        if len(prompt) > 2000:
            print(f"\n... [{len(prompt) - 2000} more characters] ...")
        print(f"{'=' * 93}\n")

    try:
        debug_print(f"Calling {llm_provider} API with model: {llm_model}")
        content = call_llm_api(
            prompt,
            llm_model,
            provider=llm_provider,
            api_endpoint=api_endpoint,
            api_key=llm_api_key,
            max_completion_tokens=max_completion_tokens,
            temperature=0,
            debug=DEBUG
        )
    except (ValueError, requests.exceptions.RequestException) as e:
        print(f"{provider_name} API error: {e}", file=sys.stderr)
        raise

    # Measure defensively: a None reply is reported as empty, not as a TypeError.
    debug_print(f"Extracted content length: {len(content or '')} chars")
    if not content:
        print("\n" + "=" * 70, file=sys.stderr)
        print(f"⚠️  WARNING: {provider_name} returned empty response!", file=sys.stderr)
        print("=" * 70 + "\n", file=sys.stderr)
    return content


def extract_turtle(llm_response: str) -> Optional[str]:
    """Extract Turtle RDF from LLM response.

    Strategies, tried in order:
      1. A fenced code block labelled turtle/ttl/n3/rdf.
      2. The first fenced code block of any kind.
      3. Text starting at the first ``@base``/``@prefix`` and running up to a
         trailing commentary marker or the end of the response.
      4. Everything before "Following your initial response", with stray
         fence markers removed.

    The extracted text is passed through ``fix_turtle_issues``.

    Args:
        llm_response: Raw text returned by the LLM.

    Returns:
        The extracted Turtle text, or None when nothing could be extracted.
    """
    if not llm_response:
        return None

    # Prefer a block explicitly labelled as Turtle, wherever it appears.
    fenced = re.search(
        r'```(?:turtle|ttl|n3|rdf)(?=\s)\s*([\s\S]*?)```',
        llm_response,
        re.IGNORECASE,
    )
    if not fenced:
        # Any other fence: consume a language tag only when it is a single
        # word ending the opening line, so tags like "text" do not leak into
        # the body while inline content right after ``` is preserved.
        fenced = re.search(
            r'```(?:[\w+.-]+[ \t]*(?=\r?\n))?\s*([\s\S]*?)```',
            llm_response,
        )

    if fenced:
        turtle_text = fenced.group(1).strip()
    else:
        # Look for content starting with @base or @prefix
        directive = re.search(
            r'(@(?:base|prefix)[\s\S]*?)(?=\n\n(?:Following|Additional|Note:|---)|$)',
            llm_response,
            re.IGNORECASE,
        )
        if directive:
            turtle_text = directive.group(1).strip()
        else:
            # Take everything before "Following your initial response".
            # re.split always yields at least one element, so index 0 is safe.
            leading = re.split(
                r'\n\s*Following your initial response',
                llm_response,
                flags=re.IGNORECASE,
            )[0]
            turtle_text = leading.strip()
            # Remove any markdown code block markers (e.g. an unclosed fence)
            turtle_text = re.sub(r'^```\w*\n?', '', turtle_text)
            turtle_text = re.sub(r'\n?```$', '', turtle_text)

    # Fix common issues
    turtle_text = fix_turtle_issues(turtle_text)

    # Report "nothing extracted" as None, as the return annotation promises.
    return turtle_text or None


def extract_jsonld(llm_response: str) -> Optional[str]:
    """Extract JSON-LD from LLM response.

    Strategies, tried in order:
      1. A fenced code block labelled json-ld/jsonld/json.
      2. The first fenced code block of any kind.
      3. The outermost ``{...}`` or ``[...]`` span in the raw text, whichever
         starts first, with the other span as a second attempt.

    Each candidate is passed through ``fix_json_issues`` and must then parse
    with ``json.loads``.

    Args:
        llm_response: Raw text returned by the LLM.

    Returns:
        The repaired JSON text, or None when no candidate parses (a warning
        with the last parse error is written to stderr in that case).
    """
    if not llm_response:
        return None

    # "json-?ld" must come before "json": with the shorter alternative first,
    # a ```jsonld fence matched as "json" and left "ld" in the captured body.
    fenced = re.search(
        r'```(?:json-?ld|json)(?=\s)\s*([\s\S]*?)```',
        llm_response,
        re.IGNORECASE,
    )
    if not fenced:
        # Generic fence: drop a one-word language tag that ends the opening line.
        fenced = re.search(
            r'```(?:[\w+.-]+[ \t]*(?=\r?\n))?\s*([\s\S]*?)```',
            llm_response,
        )

    if fenced:
        candidates = [fenced.group(1).strip()]
    else:
        # Try to find a raw JSON object or array. A JSON-LD document may be a
        # top-level array, which an object-only search would cut apart.
        spans = [
            re.search(r'\{[\s\S]*\}', llm_response),
            re.search(r'\[[\s\S]*\]', llm_response),
        ]
        found = sorted((m for m in spans if m), key=lambda m: m.start())
        candidates = [m.group(0).strip() for m in found]

    if not candidates:
        return None

    last_error = None
    for candidate in candidates:
        # Fix common issues
        repaired = fix_json_issues(candidate)

        # Validate it's proper JSON
        try:
            json.loads(repaired)
        except json.JSONDecodeError as e:
            last_error = e
            continue
        return repaired

    print(f"⚠️  JSON parsing warning: {last_error}", file=sys.stderr)
    return None


def fix_turtle_issues(turtle_text: str) -> str:
    """Attempt to fix common Turtle syntax issues.

    * Windows line endings (CRLF) are converted to LF.
    * Typographic ("smart") quotes that appear *outside* ASCII-quoted string
      literals and IRIs are replaced by their ASCII equivalents, so they can
      act as literal delimiters. Smart quotes inside an ASCII-delimited
      literal are content and are left untouched; rewriting them would
      terminate the literal early.

    Args:
        turtle_text: Turtle text extracted from the LLM response.

    Returns:
        The repaired Turtle text.
    """
    # Ensure proper line endings first so the short-literal patterns below
    # only need to exclude "\n".
    turtle_text = turtle_text.replace('\r\n', '\n')

    # The smart quotes are written as \u escapes so the pattern survives any
    # re-encoding of this source file.
    token = re.compile(
        r'(?P<keep>'
        r'"""(?:\\[\s\S]|[^\\])*?"""'      # long double-quoted literal
        r"|'''(?:\\[\s\S]|[^\\])*?'''"     # long single-quoted literal
        r'|"(?:\\.|[^"\\\n])*"'            # short double-quoted literal
        r"|'(?:\\.|[^'\\\n])*'"            # short single-quoted literal
        r'|<[^<>\s]*>'                     # IRI reference
        r')'
        r'|(?P<double>[\u201c\u201d])'     # left/right double smart quote
        r'|(?P<single>[\u2018\u2019])'     # left/right single smart quote
    )

    def _normalise(match):
        if match.lastgroup == 'double':
            return '"'
        if match.lastgroup == 'single':
            return "'"
        # Literal or IRI: keep verbatim.
        return match.group(0)

    return token.sub(_normalise, turtle_text)


def fix_json_issues(jsonld_text: str) -> str:
    """Attempt to fix common JSON issues.

    * Typographic ("smart") quotes that appear *outside* ASCII-quoted strings
      are replaced by their ASCII equivalents so they can act as delimiters.
    * Trailing commas before a closing ``}`` or ``]`` are removed.

    Complete ASCII-quoted strings are skipped as a unit, so smart quotes and
    ``,}`` sequences inside string values are preserved and valid JSON passes
    through unchanged.

    Args:
        jsonld_text: JSON text extracted from the LLM response.

    Returns:
        The repaired JSON text (not guaranteed to be valid JSON).
    """
    # Single-backslash escapes: the previous pattern doubled them inside a raw
    # string and therefore looked for literal backslashes, never matching.
    # Smart quotes are \u escapes so the pattern survives file re-encoding.
    token = re.compile(
        r'(?P<string>"(?:\\[\s\S]|[^"\\])*")'   # complete JSON string
        r'|(?P<double>[\u201c\u201d])'          # left/right double smart quote
        r'|(?P<single>[\u2018\u2019])'          # left/right single smart quote
        r'|(?P<comma>,(?=\s*[}\]]))'            # comma directly before a closer
    )

    def _repair(match):
        kind = match.lastgroup
        if kind == 'string':
            return match.group(0)
        if kind == 'double':
            return '"'
        if kind == 'single':
            return "'"
        # Trailing comma: drop it, keeping the whitespace that follows.
        return ''

    return token.sub(_repair, jsonld_text)


def validate_rdf(rdf_text: str, rdf_format: str) -> Tuple[bool, Optional[str]]:
    """Check that ``rdf_text`` parses with rdflib.

    Args:
        rdf_text: Serialized RDF to check.
        rdf_format: ``"jsonld"`` selects the JSON-LD parser; any other value
            selects the Turtle parser.

    Returns:
        ``(True, None)`` when parsing succeeds or rdflib is unavailable,
        otherwise ``(False, message)`` with the parser's error text.
    """
    if not HAS_RDFLIB:
        # Without rdflib there is nothing to validate with; report success.
        return True, None

    parser_name = 'json-ld' if rdf_format == "jsonld" else 'turtle'
    try:
        Graph().parse(data=rdf_text, format=parser_name)
    except Exception as exc:
        return False, str(exc)
    return True, None


def rdf_to_ntriples(rdf_text: str, rdf_format: str) -> Optional[str]:
    """Convert RDF (Turtle or JSON-LD) to N-Triples format using rdflib.

    Args:
        rdf_text: Serialized RDF to convert.
        rdf_format: ``"jsonld"`` selects the JSON-LD parser; any other value
            selects the Turtle parser.

    Returns:
        The N-Triples serialization as text, or None when rdflib is not
        installed or parsing/serialization fails (a warning is written to
        stderr in both cases).
    """
    if not HAS_RDFLIB:
        print("⚠️  rdflib not installed, skipping RDF conversion", file=sys.stderr)
        return None

    parser_name = 'json-ld' if rdf_format == "jsonld" else 'turtle'
    try:
        g = Graph()
        g.parse(data=rdf_text, format=parser_name)
        serialized = g.serialize(format='nt')
    except Exception as e:
        print(f"⚠️  RDF to N-Triples conversion error: {e}", file=sys.stderr)
        return None

    # Older rdflib releases return bytes from serialize(); callers embed the
    # result in a SPARQL string, so always hand back text.
    if isinstance(serialized, bytes):
        serialized = serialized.decode('utf-8')
    return serialized


def parse_turtle_for_sparql(turtle: str) -> Tuple[str, str]:
    """
    Separate prefix declarations from triple content for SPARQL INSERT.

    The text is processed line by line. ``@base``/``@prefix`` directives and
    their SPARQL-style ``BASE``/``PREFIX`` forms (matched case-insensitively)
    are rewritten as SPARQL ``BASE``/``PREFIX`` lines. Blank lines and lines
    whose first non-space character is ``#`` are dropped. A line that starts
    with ``@base``/``@prefix`` but cannot be parsed is dropped as well. Every
    other line is kept verbatim as triple content.

    Note: the split is line-based, so a directive must sit on its own line.

    Args:
        turtle: Turtle document text.

    Returns:
        (prefixes_as_sparql, triples), each a newline-joined string.
    """
    # Whitespace between the prefix name and the IRI is optional in Turtle,
    # and the IRI may be empty.
    prefix_re = re.compile(
        r'(?:@prefix|PREFIX)\s+([^\s:<]*:)\s*(<[^>]*>)', re.IGNORECASE
    )
    base_re = re.compile(r'(?:@base|BASE)\s*<([^>]+)>', re.IGNORECASE)

    prefix_lines = []
    triple_lines = []

    for line in turtle.split('\n'):
        stripped = line.strip()
        if not stripped or stripped.startswith('#'):
            continue

        base_match = base_re.match(stripped)
        if base_match:
            prefix_lines.append(f"BASE <{base_match.group(1)}>")
            continue

        prefix_match = prefix_re.match(stripped)
        if prefix_match:
            prefix_lines.append(
                f"PREFIX {prefix_match.group(1)} {prefix_match.group(2)}"
            )
            continue

        if stripped.startswith(('@base', '@prefix')):
            # Malformed directive: it is not triple content, so leave it out
            # of the INSERT body rather than break the update.
            continue

        triple_lines.append(line)

    return '\n'.join(prefix_lines), '\n'.join(triple_lines)


def generate_output_filename(source: str, output_format: str, output_dir: str) -> str:
    """Build a timestamped output path derived from the source.

    For URLs the name is made from the network location and path with every
    character other than word characters and ``-`` replaced by ``_``, capped
    at 60 characters. For anything else it is the file stem of ``source``.

    Args:
        source: URL or file path of the source document.
        output_format: Key looked up in ``FORMAT_EXTENSIONS``; unknown keys
            fall back to ``.ttl``.
        output_dir: Directory the returned path is placed in.

    Returns:
        ``<output_dir>/<name>_<YYYYmmdd_HHMMSS><ext>``.
    """
    if not is_url(source):
        stem = Path(source).stem
    else:
        url_parts = urlparse(source)
        raw_name = f"{url_parts.netloc}_{url_parts.path}"
        stem = re.sub(r'[^\w\-]', '_', raw_name)[:60]

    # The timestamp keeps repeated runs from overwriting each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    suffix = FORMAT_EXTENSIONS.get(output_format, '.ttl')

    return os.path.join(output_dir, f"{stem}_{stamp}{suffix}")


def save_rdf_to_file(
    rdf_text: str,
    output_file: str,
    output_format: str
) -> bool:
    """Save RDF to a local file.

    Missing parent directories are created. The file is written as UTF-8 and
    overwrites any existing file at the same path.

    Args:
        rdf_text: Serialized RDF to write.
        output_file: Destination path.
        output_format: Accepted for interface compatibility; not used here.

    Returns:
        True on success. On any failure an error is printed to stderr and
        False is returned.
    """
    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test when the directory appears in between.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(rdf_text)
    except Exception as e:
        # Deliberate catch-all: callers rely on the boolean result.
        print(f"ERROR: Failed to save RDF to file: {e}", file=sys.stderr)
        return False

    return True


def upload_to_sparql(
    rdf_text: str,
    rdf_format: str,
    graph_iri: str,
    sparql_endpoint: str,
    user: str,
    password: str
) -> bool:
    """Upload RDF to SPARQL endpoint using SPARQL UPDATE.

    The RDF is converted to N-Triples when possible and sent as an
    ``INSERT DATA`` into ``graph_iri``. If conversion is unavailable, Turtle
    is split into prefixes and triples and inserted directly; JSON-LD goes
    straight to ``upload_direct``. When the endpoint answers with a non-2xx
    status, ``upload_direct`` is tried as a fallback.

    Args:
        rdf_text: Serialized RDF to upload.
        rdf_format: ``"turtle"`` or ``"jsonld"``.
        graph_iri: Named graph to insert into.
        sparql_endpoint: URL the update is POSTed to.
        user: Username; authentication is sent only when this is truthy.
        password: Password paired with ``user``.

    Returns:
        True when the update (or the fallback upload) succeeded, else False.
    """
    # Try converting to N-Triples first (most reliable)
    ntriples = rdf_to_ntriples(rdf_text, rdf_format)

    if ntriples:
        # Use N-Triples for insertion
        sparql_update = f"""
INSERT DATA {{
    GRAPH <{graph_iri}> {{
        {ntriples}
    }}
}}
"""
    elif rdf_format == "turtle":
        # Parse Turtle and insert directly
        prefixes, triples = parse_turtle_for_sparql(rdf_text)
        sparql_update = f"""
{prefixes}

INSERT DATA {{
    GRAPH <{graph_iri}> {{
        {triples}
    }}
}}
"""
    else:
        # Try direct upload via Graph Store Protocol for JSON-LD
        return upload_direct(rdf_text, rdf_format, graph_iri, sparql_endpoint, user, password)

    # Execute the update. The timeout keeps a hung endpoint from blocking the
    # run forever; transport errors become a False result, not a traceback.
    try:
        response = requests.post(
            sparql_endpoint,
            data={"update": sparql_update},
            auth=(user, password) if user else None,
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            timeout=120
        )
    except requests.exceptions.RequestException as e:
        print(f"Error uploading to SPARQL endpoint: {e}", file=sys.stderr)
        return False

    # Any 2xx status is success. Treating e.g. 204 No Content as a failure
    # would re-send the same data through the fallback path.
    if not 200 <= response.status_code < 300:
        print(f"Error uploading to SPARQL endpoint: {response.status_code}", file=sys.stderr)
        print(f"Response: {response.text}", file=sys.stderr)
        # Try direct upload as fallback
        return upload_direct(rdf_text, rdf_format, graph_iri, sparql_endpoint, user, password)

    return True


def _derive_graph_store_url(sparql_endpoint: str) -> str:
    """Map a SPARQL endpoint URL to its ``sparql-graph-crud-auth`` sibling."""
    trimmed = sparql_endpoint.rstrip('/')
    if trimmed.endswith(('/sparql-graph-crud', '/sparql-graph-crud-auth')):
        # Already a graph CRUD URL: use it unchanged.
        return trimmed
    for suffix in ('/sparql-auth', '/sparql'):
        if trimmed.endswith(suffix):
            # Swap only the final path segment; a blanket str.replace would
            # also rewrite earlier "/sparql" occurrences and would turn
            # "/sparql-auth" into "/sparql-graph-crud-auth-auth".
            return trimmed[:-len(suffix)] + '/sparql-graph-crud-auth'
    # Unrecognised layout: keep the original substitution behaviour.
    return sparql_endpoint.replace('/sparql', '/sparql-graph-crud-auth')


def upload_direct(
    rdf_text: str,
    rdf_format: str,
    graph_iri: str,
    sparql_endpoint: str,
    user: str,
    password: str
) -> bool:
    """
    Upload RDF directly to SPARQL endpoint using the Graph Store Protocol.
    Fallback if SPARQL UPDATE fails.

    The target URL is derived from ``sparql_endpoint`` by swapping its final
    ``/sparql`` (or ``/sparql-auth``) segment for ``/sparql-graph-crud-auth``.
    NOTE(review): that path and the ``graph-uri`` parameter look
    Virtuoso-specific — confirm for other SPARQL servers.

    Args:
        rdf_text: Serialized RDF; sent UTF-8 encoded as the request body.
        rdf_format: ``"jsonld"`` sends ``application/ld+json``; any other
            value sends ``text/turtle``.
        graph_iri: Target graph, passed as the ``graph-uri`` parameter.
        sparql_endpoint: SPARQL endpoint URL the target URL is derived from.
        user: Username; authentication is sent only when this is truthy.
        password: Password paired with ``user``.

    Returns:
        True when the server answers 200, 201 or 204; False otherwise,
        including transport errors (details are printed to stderr).
    """
    # Try using the graph store protocol endpoint
    graph_store_url = _derive_graph_store_url(sparql_endpoint)

    if rdf_format == "jsonld":
        content_type = "application/ld+json"
    else:
        content_type = "text/turtle"

    # The timeout keeps a hung endpoint from blocking forever; transport
    # errors become a False result instead of an uncaught exception.
    try:
        response = requests.post(
            graph_store_url,
            params={"graph-uri": graph_iri},
            data=rdf_text.encode('utf-8'),
            auth=(user, password) if user else None,
            headers={"Content-Type": content_type},
            timeout=120
        )
    except requests.exceptions.RequestException as e:
        print(f"Error uploading RDF directly: {e}", file=sys.stderr)
        return False

    if response.status_code not in [200, 201, 204]:
        print(f"Error uploading RDF directly: {response.status_code}", file=sys.stderr)
        print(f"Response: {response.text}", file=sys.stderr)
        return False

    return True


def verify_upload(
    graph_iri: str,
    sparql_endpoint: str,
    user: str,
    password: str
):
    """Verify the upload by querying for content.

    Runs a SELECT that counts subjects per ``rdf:type`` in ``graph_iri``
    (top 15 by count) and prints the result. Verification is best-effort:
    every failure is reported on stderr and the function returns normally.

    Args:
        graph_iri: Named graph to inspect.
        sparql_endpoint: URL the query is sent to.
        user: Username; authentication is sent only when this is truthy.
        password: Password paired with ``user``.
    """

    # Query for general entity types
    query = f"""
PREFIX schema: <http://schema.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>

SELECT ?type (COUNT(?s) AS ?count)
WHERE {{
    GRAPH <{graph_iri}> {{
        ?s rdf:type ?type .
    }}
}}
GROUP BY ?type
ORDER BY DESC(?count)
LIMIT 15
"""

    try:
        response = requests.get(
            sparql_endpoint,
            params={"query": query, "format": "application/json"},
            auth=(user, password) if user else None,
            # Ask for JSON results via content negotiation as well, for
            # endpoints that ignore the "format" query parameter.
            headers={"Accept": "application/sparql-results+json, application/json;q=0.9"},
            timeout=60
        )
    except requests.exceptions.RequestException as e:
        print(f"⚠️  Verification query failed: {e}", file=sys.stderr)
        return

    if response.status_code != 200:
        print(f"⚠️  Verification query failed: {response.text}", file=sys.stderr)
        return

    try:
        results = response.json()
    except ValueError as e:
        # The endpoint answered 200 but not with JSON (e.g. XML results).
        print(f"⚠️  Verification query failed: could not parse response as JSON: {e}", file=sys.stderr)
        return

    bindings = []
    if isinstance(results, dict):
        bindings = results.get("results", {}).get("bindings", [])

    print(f"\n📋 Verification: Entity types in graph <{graph_iri}>:\n")
    if bindings:
        for binding in bindings:
            entity_type = binding.get("type", {}).get("value", "")
            count = binding.get("count", {}).get("value", "")
            print(f"   {entity_type}: {count}")
    else:
        print("   No entity types found.")


def save_response(
    llm_response: str,
    source: str,
    output_format: str,
    output_dir: str = "."
):
    """Save the full LLM response to a markdown file.

    The file is named ``<name>_<YYYYmmdd_HHMMSS>_response.md`` inside
    ``output_dir``. For URLs ``<name>`` is the source with non-word
    characters replaced by ``_``, capped at 50 characters; otherwise it is
    the file stem of ``source``. The file starts with a short header (source,
    format, generation time) followed by the raw response.

    Saving is best-effort: a failure is reported on stderr and not raised.

    Args:
        llm_response: Full response text to store.
        source: URL or file path of the source document.
        output_format: Format label recorded in the header.
        output_dir: Directory to write into; created when missing.
    """

    # Create safe filename from source
    if is_url(source):
        safe_name = re.sub(r'[^\w]', '_', source)[:50]
    else:
        safe_name = Path(source).stem

    # One clock reading feeds both the filename and the header, so they agree.
    generated_at = datetime.now()
    timestamp = generated_at.strftime("%Y%m%d_%H%M%S")

    # Save full LLM response (includes suggestions)
    response_file = os.path.join(output_dir, f"{safe_name}_{timestamp}_response.md")

    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(response_file, 'w', encoding='utf-8') as f:
            f.write("# RDF Generation Response\n\n")
            f.write(f"**Source:** {source}\n")
            f.write(f"**Format:** {output_format}\n")
            f.write(f"**Generated:** {generated_at.isoformat()}\n\n")
            f.write("---\n\n")
            f.write(llm_response)

        print(f"💾 Full response saved to: {response_file}")
    except Exception as e:
        # Deliberate catch-all: losing the debug copy must not abort the run.
        print(f"⚠️  Failed to save response: {e}", file=sys.stderr)


def print_supported_formats():
    """Write the table of supported input formats to stdout."""
    report = (
        "\n📁 Supported Input Formats:",
        "   URLs:        HTML web pages",
        "   PDF:         .pdf",
        "   Word:        .docx, .doc",
        "   Excel:       .xlsx, .xls",
        "   PowerPoint:  .pptx, .ppt",
        "   CSV:         .csv",
        "   Markdown:    .md, .markdown",
        "   HTML:        .html, .htm",
        "   Text:        .txt",
        "",  # trailing blank line
    )
    print("\n".join(report))


def print_llm_providers():
    """Write one entry per ``LLM_PROVIDERS`` item to stdout.

    Each entry shows the provider id, its display name, how it authenticates
    (environment variable name, or a note that no auth is required), its
    default model, and a label for its API type.
    """
    # API types without an entry here are labelled OpenAI-compatible.
    native_labels = {
        "anthropic": "Anthropic native API",
        "openai": "OpenAI native API",
    }

    print("\n🤖 Supported LLM Providers:\n")
    for provider_id, config in LLM_PROVIDERS.items():
        api_type_label = native_labels.get(
            config.get("api_type", "unknown"), "OpenAI-compatible API"
        )

        if config.get("no_auth"):
            auth_label = "(no auth required)"
        else:
            auth_label = f"(env: {config.get('api_key_env')})"

        print(f"   {provider_id:12} - {config['name']:30} {auth_label}")
        print(f"   {'':12}   Default model: {config['default_model']}")
        print(f"   {'':12}   API type: {api_type_label}")
    print()


def print_prompt_help():
    """Write the prompt-customization help text to stdout."""
    # Plain (non-f) strings: the braces are shown to the user literally.
    help_text = (
        "\n📝 Prompt Customization:",
        '   --prompt "text"           Provide prompt directly on command line',
        "   --prompt-file /path/file  Load prompt from local file",
        "   --prompt-file https://... Load prompt from remote URL",
        "",
        "   IMPORTANT: Your prompt MUST contain these placeholders:",
        "   {source_uri}      - Replaced with the source document URI",
        "   {document_text}   - Replaced with the extracted document content",
        "",
        "   If {document_text} is missing, the LLM will hallucinate!",
        "",  # trailing blank line
    )
    print("\n".join(help_text))


def print_output_modes():
    """Write the description of the three output modes to stdout."""
    mode_help = (
        "\n📤 Output Modes:",
        "   sparql   Upload to SPARQL endpoint (default)",
        "   file     Save to local file only (no upload)",
        "   both     Upload to SPARQL endpoint AND save to local file",
        "",  # trailing blank line
    )
    print("\n".join(mode_help))


def check_dependencies():
    """Report missing optional dependencies on stdout.

    Each ``HAS_*`` module flag is paired with the message shown when that
    package is unavailable. Nothing is printed when every flag is truthy.
    """
    availability = (
        (HAS_DOCX, "python-docx not installed (needed for DOCX)"),
        (HAS_PPTX, "python-pptx not installed (needed for PPTX)"),
        (HAS_PANDAS, "pandas not installed (needed for Excel and CSV)"),
        (HAS_RDFLIB, "rdflib not installed (needed for RDF validation)"),
        (HAS_BS4, "beautifulsoup4 not installed (recommended for better HTML parsing)"),
        (HAS_PYPDF, "pypdf not installed (fallback PDF reader)"),
        (HAS_PDFPLUMBER, "pdfplumber not installed (best for complex PDF layouts)"),
    )
    issues = [message for present, message in availability if not present]

    if not issues:
        return

    print("⚠️  Optional dependencies missing:")
    for issue in issues:
        print(f"   - {issue}")
    print()
    print("   Install all dependencies with:")
    print("   pip install requests beautifulsoup4 rdflib pypdf pdfplumber python-docx python-pptx pandas openpyxl")
    print()


def main():
    """Command-line entry point: source document -> LLM -> RDF -> file and/or SPARQL.

    Steps, in order:
      1. Parse arguments and set the module-level ``DEBUG`` flag.
      2. Handle the informational flags (--prompt-help, --output-help,
         --list-formats, --list-providers), each of which exits with status 0.
      3. Resolve the API key (--api-key, else the environment variable named
         by --api-key-env or by the provider config) and the model.
      4. Load the prompt template and the source document.
      5. Call ``generate_rdf`` and print the full LLM response.
      6. Extract Turtle or JSON-LD from the response and, when rdflib is
         available, validate it (a validation failure is only a warning).
      7. Unless --validate-only is given, save to a file and/or upload to the
         SPARQL endpoint according to --output-mode.

    Exits with status 1 when a required API key is missing, the source cannot
    be loaded, generation fails, no RDF can be extracted, or any requested
    output step fails.
    """
    global DEBUG
    
    parser = argparse.ArgumentParser(
        description="Generate comprehensive RDF from various document types using any LLM provider",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--source",
        default=DEFAULT_SOURCE,
        help="URL or file path of the source document"
    )
    parser.add_argument(
        "--graph-iri",
        default=DEFAULT_GRAPH_IRI,
        help="Named graph IRI for storing the RDF"
    )
    parser.add_argument(
        "--format",
        choices=["turtle", "jsonld"],
        default="turtle",
        help="Output RDF format"
    )
    parser.add_argument(
        "--output-mode",
        choices=["sparql", "file", "both"],
        default=DEFAULT_OUTPUT_MODE,
        help="Output mode"
    )
    parser.add_argument(
        "--output-file",
        default=None,
        help="Output file path"
    )
    parser.add_argument(
        "--output-dir",
        default=".",
        help="Directory for output files"
    )
    parser.add_argument(
        "--llm-provider",
        choices=list(LLM_PROVIDERS.keys()),
        default=DEFAULT_LLM_PROVIDER,
        help="Which LLM provider to use"
    )
    parser.add_argument(
        "--api-endpoint",
        default=None,
        help="Override API endpoint (for custom providers)"
    )
    parser.add_argument(
        "--api-key",
        default=None,
        help="API key (if not set via environment variable)"
    )
    parser.add_argument(
        "--api-key-env",
        default=None,
        help="Environment variable name for API key"
    )
    parser.add_argument(
        "--model",
        default=None,
        help="Model to use (defaults to provider's default)"
    )
    parser.add_argument(
        "--prompt",
        default=None,
        help="Custom prompt text (inline)"
    )
    parser.add_argument(
        "--prompt-file",
        default=None,
        help="Path or URL to custom prompt file"
    )
    parser.add_argument(
        "--sparql-endpoint",
        default=DEFAULT_SPARQL_ENDPOINT,
        help="SPARQL endpoint URL"
    )
    parser.add_argument(
        "--user",
        default=DEFAULT_USER,
        help="SPARQL endpoint username"
    )
    parser.add_argument(
        "--password",
        default=DEFAULT_PASSWORD,
        help="SPARQL endpoint password"
    )
    parser.add_argument(
        "--verify",
        action="store_true",
        help="Verify upload with a test query"
    )
    parser.add_argument(
        "--save-response",
        action="store_true",
        help="Save full LLM response to markdown file"
    )
    parser.add_argument(
        "--max-content-chars",
        type=int,
        default=12000,
        help="Maximum characters of document content to include in prompt"
    )
    parser.add_argument(
        "--max-completion-tokens",
        type=int,
        default=16384,
        help="Maximum completion tokens for LLM API"
    )
    parser.add_argument(
        "--validate-only",
        action="store_true",
        help="Generate and validate RDF without output"
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug output"
    )
    parser.add_argument(
        "--list-formats",
        action="store_true",
        help="List supported input formats and exit"
    )
    parser.add_argument(
        "--list-providers",
        action="store_true",
        help="List supported LLM providers and exit"
    )
    parser.add_argument(
        "--prompt-help",
        action="store_true",
        help="Show prompt customization help and exit"
    )
    parser.add_argument(
        "--output-help",
        action="store_true",
        help="Show output mode help and exit"
    )

    args = parser.parse_args()
    
    # Set debug flag (module-level; read by debug_print and generate_rdf)
    DEBUG = args.debug

    # Show help and exit if requested
    if args.prompt_help:
        print_prompt_help()
        sys.exit(0)

    if args.output_help:
        print_output_modes()
        sys.exit(0)

    if args.list_formats:
        print_supported_formats()
        check_dependencies()
        sys.exit(0)

    if args.list_providers:
        print_llm_providers()
        sys.exit(0)

    # Determine API key: an explicit --api-key wins; otherwise read the
    # environment variable named by --api-key-env or by the provider config.
    provider_config = LLM_PROVIDERS[args.llm_provider]
    key_env = args.api_key_env or provider_config.get("api_key_env")
    
    api_key = args.api_key
    if not api_key and key_env:
        api_key = os.environ.get(key_env)
    
    # Only require API key if the provider needs auth.
    # Note: when no environment variable name is known (key_env is falsy)
    # this check does not fire, so the run continues without a key.
    if not provider_config.get("no_auth") and not api_key and key_env:
        print(f"ERROR: API key required. Set {key_env} environment variable or use --api-key", file=sys.stderr)
        sys.exit(1)

    # Determine model
    llm_model = args.model or provider_config.get("default_model")

    # Check dependencies (prints a notice only; never aborts)
    check_dependencies()

    # Determine output file if needed: an explicit --output-file wins,
    # otherwise a timestamped name is generated inside --output-dir.
    output_file = args.output_file
    if args.output_mode in ["file", "both"] and not output_file:
        output_file = generate_output_filename(args.source, args.format, args.output_dir)

    # --- Display configuration ---
    print("🔧 Configuration:")
    print(f"   Source:           {args.source}")
    print(f"   Output Format:    {args.format}")
    print(f"   Output Mode:      {args.output_mode}")
    if args.output_mode in ["file", "both"]:
        print(f"   Output File:      {output_file}")
    if args.output_mode in ["sparql", "both"]:
        print(f"   Graph IRI:        {args.graph_iri}")
        print(f"   SPARQL Endpoint:  {args.sparql_endpoint}")
    print(f"   LLM Provider:     {args.llm_provider} ({provider_config['name']})")
    print(f"   Model:            {llm_model}")
    print(f"   Max Completion Tokens: {args.max_completion_tokens}")
    print(f"   Debug:            {args.debug}")
    print()

    # --- Load prompt ---
    print("📝 Loading prompt...")
    prompt_template = load_prompt(args.prompt, args.prompt_file, args.format)

    # --- Load source document ---
    print("📄 Loading source...")
    try:
        document_text, source_uri = load_source(args.source)
        print(f"   Retrieved {len(document_text)} characters")
        print(f"   Source URI: {source_uri}")
        
        # Show content preview
        if document_text.strip():
            preview = document_text[:200].replace('\n', ' ')
            print(f"   Content preview: {preview}...")
        else:
            # Empty content is only warned about; generation still proceeds.
            print("   ⚠️  WARNING: No content extracted from source!")
            
    except Exception as e:
        print(f"ERROR: Failed to load source: {e}", file=sys.stderr)
        sys.exit(1)

    # --- Generate RDF with the LLM ---
    print(f"\n🧠 Generating {args.format.upper()} RDF with {provider_config['name']}...")
    try:
        llm_response = generate_rdf(
            document_text,
            api_key,
            llm_model,
            source_uri,
            prompt_template,
            llm_provider=args.llm_provider,
            api_endpoint=args.api_endpoint,
            max_content_chars=args.max_content_chars,
            max_completion_tokens=args.max_completion_tokens
        )
        print("\n--- Full LLM Response ---")
        print(llm_response)
        print("--- End Response ---\n")
    except Exception as e:
        print(f"ERROR: Failed to generate RDF: {e}", file=sys.stderr)
        sys.exit(1)

    # Save full response if requested (done before extraction, so the raw
    # response is kept even when extraction fails below)
    if args.save_response:
        save_response(llm_response, args.source, args.format, args.output_dir)

    # Extract RDF from response
    if args.format == "jsonld":
        rdf_text = extract_jsonld(llm_response)
    else:
        rdf_text = extract_turtle(llm_response)
    
    if not rdf_text:
        print(f"ERROR: Could not extract valid {args.format.upper()} from response", file=sys.stderr)
        sys.exit(1)
    
    print(f"✅ {args.format.upper()} extracted")
    
    # Validate RDF (a failure is reported as a warning; output still proceeds)
    if HAS_RDFLIB:
        print("🔍 Validating RDF...")
        is_valid, error = validate_rdf(rdf_text, args.format)
        if is_valid:
            print("✅ RDF validation passed")
        else:
            print(f"⚠️  RDF validation warning: {error}", file=sys.stderr)

    # Skip output if validate-only
    if args.validate_only:
        print("\n🔍 Validate-only mode - skipping output")
        sys.exit(0)

    # --- Handle output based on mode ---
    # In "both" mode each step runs independently; a failure in either one
    # makes the process exit with status 1 at the end.
    success = True
    
    # File output
    if args.output_mode in ["file", "both"]:
        print(f"💾 Saving to file: {output_file}")
        if save_rdf_to_file(rdf_text, output_file, args.format):
            print(f"✅ RDF saved to: {output_file}")
        else:
            print("❌ Failed to save RDF to file", file=sys.stderr)
            success = False
    
    # SPARQL output
    if args.output_mode in ["sparql", "both"]:
        print("🗄️  Uploading to SPARQL endpoint...")
        if upload_to_sparql(
            rdf_text,
            args.format,
            args.graph_iri,
            args.sparql_endpoint,
            args.user,
            args.password
        ):
            print(f"✅ RDF successfully stored in graph <{args.graph_iri}>")
            
            # Verification only runs after a successful upload
            if args.verify:
                verify_upload(
                    args.graph_iri,
                    args.sparql_endpoint,
                    args.user,
                    args.password
                )
        else:
            print("❌ Failed to upload RDF to SPARQL endpoint", file=sys.stderr)
            success = False
    
    if not success:
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
