Código-fonte para main

#!/usr/bin/env python3
"""
CLI for LLM local inference viability calculator.

Allows quickly discovering which models run on which GPU
for a given context size.

CLI para calculadora de viabilidade de inferência local de LLMs.
Permite descobrir rapidamente quais modelos rodam em qual GPU
para um determinado tamanho de contexto.
"""

import argparse
import csv
import json
import sys
from typing import List

from models import LLMModel, get_all_models, get_model_by_size
from gpus import GPU, get_all_gpus, get_consumer_gpus, get_datacenter_gpus, get_gpu_by_name
from calculator import (
    VRAMCalculator, Quantization, InferenceResult, Status, CalculationMode,
    LayerOffloadCalculator, LayerOffloadResult,
    CPUOffloadCalculator, CPUOffloadResult,
)
from formats import ModelFormat, detect_gguf_quantization, get_format_from_filename
from multi_gpu import (
    MultiGPUConfig, MultiGPUCalculator, MultiGPUMode,
    parse_gpu_config_string, create_multi_gpu_config,
)


# Terminal colors for enhanced readability
# Cores de terminal para melhor legibilidade

[documentos]
class Colors:
    """ANSI color codes for terminal output.

    Códigos de cores ANSI para saída de terminal.
    """
    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Foreground colors
    BLACK = "\033[30m"
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"

    # Background colors
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"


[documentos]
    @staticmethod
    def ok(text: str) -> str:
        """Green text for success/OK messages."""
        return f"{Colors.GREEN}{text}{Colors.RESET}"



[documentos]
    @staticmethod
    def warning(text: str) -> str:
        """Yellow text for warnings."""
        return f"{Colors.YELLOW}{text}{Colors.RESET}"



[documentos]
    @staticmethod
    def warn(text: str) -> str:
        """Yellow text for warnings (alias)."""
        return f"{Colors.YELLOW}{text}{Colors.RESET}"



[documentos]
    @staticmethod
    def error(text: str) -> str:
        """Red text for errors."""
        return f"{Colors.RED}{text}{Colors.RESET}"



[documentos]
    @staticmethod
    def info(text: str) -> str:
        """Blue text for info."""
        return f"{Colors.BLUE}{text}{Colors.RESET}"



[documentos]
    @staticmethod
    def dim(text: str) -> str:
        """Dim text for less emphasis."""
        return f"{Colors.DIM}{text}{Colors.RESET}"



[documentos]
    @staticmethod
    def bold(text: str) -> str:
        """Bold text."""
        return f"{Colors.BOLD}{text}{Colors.RESET}"





[documentos]
def print_table(
    results: List[InferenceResult],
    group_by_gpu: bool = False,
    show_only_runs: bool = False,
):
    """Prints results in ASCII table format.

    Imprime resultados em formato de tabela ASCII.

    Args:
        results: List of results / Lista de resultados
        group_by_gpu: Group by GPU instead of model / Agrupa por GPU
        show_only_runs: Show only running combinations / Apenas combinações que rodam
    """
    if show_only_runs:
        results = [r for r in results if r.status == Status.RUNS]

    if not results:
        print("\nNo combinations found. / Nenhuma combinação encontrada.")
        return

    # Header / Cabeçalho
    if group_by_gpu:
        header = f"{'GPU':<25} {'VRAM':<8} {'Model':<30} {'Needed':<10} {'Status':<10}"
    else:
        header = f"{'Model':<30} {'Needed':<10} {'GPU':<25} {'VRAM':<8} {'Status':<10}"

    separator = "-" * len(header)

    print(f"\n{header}")
    print(separator)

    # Sort: running first, then by VRAM required
    # Ordenar: roda primeiro, depois por VRAM necessária
    sorted_results = sorted(
        results,
        key=lambda r: (r.status != Status.RUNS, r.required_vram_gb),
    )

    for r in sorted_results:
        status_str = r.status.value
        if r.status == Status.RUNS:
            status_str = f"\033[92m{status_str}\033[0m"  # Green / Verde
        else:
            status_str = f"\033[91m{status_str}\033[0m"  # Red / Vermelho

        if group_by_gpu:
            row = (
                f"{r.gpu_name:<25} "
                f"{r.gpu_vram_gb:<8} "
                f"{f'{r.model_params_billion}B':<30} "
                f"{r.required_vram_gb:<10.1f} "
                f"{status_str:<10}"
            )
        else:
            row = (
                f"{f'{r.model_params_billion}B':<30} "
                f"{r.required_vram_gb:<10.1f} "
                f"{r.gpu_name:<25} "
                f"{r.gpu_vram_gb:<8} "
                f"{status_str:<10}"
            )

        print(row)

        # Show warning if present
        # Mostrar aviso se houver
        if r.warning:
            print(f"  ⚠️  {r.warning}")




[documentos]
def print_summary_by_model(results: List[InferenceResult]):
    """Prints summary grouped by model size.

    Imprime resumo agrupado por tamanho de modelo.

    Shows for each model which GPUs support it.
    Mostra para cada modelo quais GPUs suportam.
    """
    print("\n" + "=" * 70)
    print("SUMMARY BY MODEL / RESUMO POR MODELO")
    print("=" * 70)

    # Group by model size
    # Agrupar por tamanho de modelo
    from collections import defaultdict

    by_model: dict[int, List[InferenceResult]] = defaultdict(list)
    for r in results:
        by_model[r.model_params_billion].append(r)

    for size in sorted(by_model.keys()):
        runnable = [r for r in by_model[size] if r.status == Status.RUNS]
        not_runnable = [r for r in by_model[size] if r.status != Status.RUNS]

        print(f"\nModel {size}B:")

        if runnable:
            print(f"  ✓ RUNS on: / RODA em: {', '.join(sorted(set(r.gpu_name for r in runnable)))}")
        else:
            print(f"  ✗ Doesn't run on any listed GPU / Não roda em nenhuma GPU listada")

        if not_runnable:
            closest = min(not_runnable, key=lambda r: r.required_vram_gb - r.gpu_vram_gb)
            print(f"  ⚠️  Closest: / Mais próximo: {closest.gpu_name} (needs / precisa de {closest.required_vram_gb:.1f} GB)")



def print_summary_by_gpu(results: List[InferenceResult]):
    """Prints summary grouped by GPU.

    Imprime resumo agrupado por GPU.

    Shows for each GPU which models it supports.
    Mostra para cada GPU quais modelos suporta.
    """
    print("\n" + "=" * 70)
    print("SUMMARY BY GPU / RESUMO POR GPU")
    print("=" * 70)

    # Group by GPU
    # Agrupar por GPU
    from collections import defaultdict

    by_gpu: dict[str, List[InferenceResult]] = defaultdict(list)
    for r in results:
        by_gpu[r.gpu_name].append(r)

    # Sort GPUs by VRAM
    # Ordenar GPUs por VRAM
    gpu_vram = {r.gpu_name: r.gpu_vram_gb for r in results}
    sorted_gpus = sorted(by_gpu.keys(), key=lambda g: gpu_vram[g])

    for gpu_name in sorted_gpus:
        gpu_results = by_gpu[gpu_name]
        runnable = [r for r in gpu_results if r.status == Status.RUNS]

        vram = gpu_results[0].gpu_vram_gb
        print(f"\n{gpu_name} ({vram} GB):")

        if runnable:
            models = sorted(set(r.model_params_billion for r in runnable))
            print(f"  ✓ Supports: / Suporta: {', '.join(f'{m}B' for m in models)}")
        else:
            print(f"  ✗ Doesn't support any listed model / Não suporta nenhum modelo listado")



[documentos]
def print_layer_offload_result(result: LayerOffloadResult, model: LLMModel, gpu: GPU):
    """Prints layer offload configuration result.

    Imprime resultado da configuração de offload de camadas.

    Args:
        result: Layer offload calculation result
        model: LLM model
        gpu: GPU being used
    """
    print("\n" + "=" * 70)
    print(f"{Colors.BOLD}OPTIMAL LAYER OFFLOAD CONFIGURATION{Colors.RESET}")
    print("=" * 70)

    print(f"\n{Colors.CYAN}Model:{Colors.RESET} {model.name} ({model.params_billion}B parameters)")
    print(f"{Colors.CYAN}GPU:{Colors.RESET}   {gpu.name} ({gpu.vram_gb} GB VRAM)")
    print(f"{Colors.CYAN}Total Layers:{Colors.RESET} {result.total_layers}")

    print(f"\n{Colors.CYAN}Layer Distribution:{Colors.RESET}")
    print(f"  Layers on GPU:  {Colors.ok(str(result.layers_on_gpu))}")
    print(f"  Layers on CPU:  {Colors.warning(str(result.layers_on_cpu)) if result.layers_on_cpu > 0 else Colors.dim(str(result.layers_on_cpu))}")
    print(f"  Offload Ratio:  {result.offload_ratio:.1%}")

    print(f"\n{Colors.CYAN}Memory Usage:{Colors.RESET}")
    print(f"  GPU VRAM used:  {result.gpu_vram_used:.2f} GB / {gpu.vram_gb} GB")
    print(f"  CPU RAM used:   {result.cpu_ram_used:.2f} GB")
    if gpu.vram_gb > 0:
        gpu_util = (result.gpu_vram_used / gpu.vram_gb) * 100
        print(f"  GPU utilization: {gpu_util:.1f}%")

    print(f"\n{Colors.CYAN}Performance Impact:{Colors.RESET}")
    if result.status == "full_gpu":
        print(f"  {Colors.ok('✓ Full GPU acceleration - no performance impact')}")
    elif result.status == "cpu_only":
        print(f"  {Colors.error('✗ CPU only inference - ~10-20x slower')}")
    else:
        print(f"  Estimated slowdown: {Colors.warning(f'{result.performance_impact:.1f}%')}")
        if result.performance_impact < 20:
            print(f"  {Colors.info('Minimal impact - good for interactive use')}")
        elif result.performance_impact < 50:
            print(f"  {Colors.warning('Moderate impact - usable with some patience')}")
        else:
            print(f"  {Colors.error('Significant impact - consider more VRAM or smaller model')}")

    print(f"\n{Colors.CYAN}Recommended Configuration:{Colors.RESET}")
    print(f"  llama.cpp:  {Colors.bold(f'--gpu-layers {result.layers_on_gpu}')}")
    print(f"  AutoGPTQ:   {Colors.bold(f'--gpu-memory {result.gpu_vram_used:.1f}G')}")

    if result.status == "partial_offload":
        print(f"\n{Colors.DIM}Note: Layers on CPU are accessed via PCIe, which is slower than GPU VRAM.{Colors.RESET}")
        print(f"{Colors.DIM}      Consider quantization (INT4) to fit more layers on GPU.{Colors.RESET}")

    print("=" * 70)




[documentos]
def print_cpu_offload_result(result: CPUOffloadResult, model: LLMModel):
    """Prints CPU offload configuration result.

    Imprime resultado da configuração de offload de CPU.

    Args:
        result: CPU offload calculation result
        model: LLM model
    """
    print("\n" + "=" * 70)
    print(f"{Colors.BOLD}CPU OFFLOAD ANALYSIS{Colors.RESET}")
    print("=" * 70)

    print(f"\n{Colors.CYAN}System Requirements:{Colors.RESET}")
    print(f"  System RAM required: {result.system_ram_required:.2f} GB")
    print(f"  System RAM available: {result.system_ram_available:.2f} GB")
    if result.fits_in_ram:
        print(f"  Status: {Colors.ok('✓ Fits in system RAM')}")
    else:
        print(f"  Status: {Colors.error(f'✗ Need {result.system_ram_required - result.system_ram_available:.2f} GB more RAM')}")

    print(f"\n{Colors.CYAN}PCIe Configuration:{Colors.RESET}")
    print(f"  Generation: PCIe {result.pcie_generation}")
    pcie_bandwidth = {"3.0": 12, "4.0": 24, "5.0": 48}.get(result.pcie_generation, 24)
    print(f"  Bandwidth: ~{pcie_bandwidth} GB/s effective")

    print(f"\n{Colors.CYAN}Performance Estimate:{Colors.RESET}")
    print(f"  Token speed: ~{result.estimated_token_speed:.1f} tokens/second")
    if result.speed_vs_full_gpu >= 0.8:
        print(f"  Speed ratio: {Colors.ok(f'{result.speed_vs_full_gpu:.1%} of full GPU')}")
    elif result.speed_vs_full_gpu >= 0.3:
        print(f"  Speed ratio: {Colors.warning(f'{result.speed_vs_full_gpu:.1%} of full GPU')}")
    else:
        print(f"  Speed ratio: {Colors.error(f'{result.speed_vs_full_gpu:.1%} of full GPU')}")

    # Print layer offload details
    offload = result.offload_config
    print(f"\n{Colors.CYAN}Layer Distribution:{Colors.RESET}")
    print(f"  Layers on GPU:  {Colors.ok(str(offload.layers_on_gpu))}")
    print(f"  Layers on CPU:  {Colors.warning(str(offload.layers_on_cpu)) if offload.layers_on_cpu > 0 else Colors.dim(str(offload.layers_on_cpu))}")
    print(f"  Offload Ratio:  {offload.offload_ratio:.1%}")
    print(f"  GPU VRAM used:  {offload.gpu_vram_used:.2f} GB")
    print(f"  CPU RAM used:   {offload.cpu_ram_used:.2f} GB")

    if offload.status == "partial_offload":
        print(f"\n{Colors.DIM}Note: Layers on CPU are accessed via PCIe, which is slower than GPU VRAM.{Colors.RESET}")
        print(f"{Colors.DIM}      Consider quantization (INT4) to fit more layers on GPU.{Colors.RESET}")

    print("=" * 70)




[documentos]
def print_multi_gpu_result(result, model: LLMModel):
    """Prints multi-GPU configuration result.

    Imprime resultado da configuração multi-GPU.

    Args:
        result: MultiGPUResult from MultiGPUCalculator
        model: LLM model
    """
    print("\n" + "=" * 70)
    print(f"{Colors.BOLD}MULTI-GPU CONFIGURATION{Colors.RESET}")
    print("=" * 70)

    print(f"\n{Colors.CYAN}Model:{Colors.RESET} {model.name} ({model.params_billion}B parameters)")

    status_text = Colors.ok('RUNS') if result.status == 'runs' else Colors.error("DOESN'T RUN")
    print(f"{Colors.CYAN}Status:{Colors.RESET} {status_text}")

    if result.bottleneck_gpu:
        print(f"  {Colors.warning(f'Bottleneck: {result.bottleneck_gpu}')}")
    if result.communication_overhead_gb > 0:
        print(f"  Communication overhead: {result.communication_overhead_gb:.2f} GB")

    print(f"\n{Colors.CYAN}Per-GPU Allocation:{Colors.RESET}")
    for gpu_name, alloc in result.per_gpu_allocation.items():
        status = Colors.ok("✓") if alloc.vram_used_gb <= alloc.gpu.vram_gb else Colors.error("✗")
        print(f"  {status} {gpu_name:<20} {alloc.vram_used_gb:6.2f} GB / {alloc.gpu.vram_gb} GB")

        if alloc.layer_count > 0:
            print(f"      Layers: {alloc.layer_count}")
        if alloc.shard_ratio > 0:
            print(f"      Shard: {alloc.shard_ratio:.1%}")

    print(f"\n{Colors.CYAN}Framework Configuration:{Colors.RESET}")
    for framework, config in result.recommended_framework_config.items():
        print(f"  {framework}: {config}")

    print("=" * 70)




[documentos]
def print_summary_by_gpu(results: List[InferenceResult]):
    """Prints summary grouped by GPU.

    Imprime resumo agrupado por GPU.

    Shows for each GPU which models it supports.
    Mostra para cada GPU quais modelos suporta.
    """
    print("\n" + "=" * 70)
    print("SUMMARY BY GPU / RESUMO POR GPU")
    print("=" * 70)

    # Group by GPU
    # Agrupar por GPU
    from collections import defaultdict

    by_gpu: dict[str, List[InferenceResult]] = defaultdict(list)
    for r in results:
        by_gpu[r.gpu_name].append(r)

    # Sort GPUs by VRAM
    # Ordenar GPUs por VRAM
    gpu_vram = {r.gpu_name: r.gpu_vram_gb for r in results}
    sorted_gpus = sorted(by_gpu.keys(), key=lambda g: gpu_vram[g])

    for gpu_name in sorted_gpus:
        gpu_results = by_gpu[gpu_name]
        runnable = [r for r in gpu_results if r.status == Status.RUNS]

        vram = gpu_results[0].gpu_vram_gb
        print(f"\n{gpu_name} ({vram} GB):")

        if runnable:
            models = sorted(set(r.model_params_billion for r in runnable))
            print(f"  ✓ Supports: / Suporta: {', '.join(f'{m}B' for m in models)}")
        else:
            print(f"  ✗ Doesn't support any listed model / Não suporta nenhum modelo listado")




[documentos]
def export_csv(results: List[InferenceResult], filepath: str):
    """Exports results to CSV.

    Exporta resultados para CSV.
    """
    with open(filepath, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Model", "Params_B", "GPU", "GPU_VRAM_GB",
            "VRAM_Required_GB", "Status", "VRAM_Free_%", "Quantization", "Warning"
        ])
        for r in results:
            writer.writerow([
                r.model_name,
                r.model_params_billion,
                r.gpu_name,
                r.gpu_vram_gb,
                round(r.required_vram_gb, 2),
                r.status.value,
                round(r.vram_free_percent, 1),
                r.quantization.value,
                r.warning or "",
            ])
    print(f"\n✓ Results exported to: / Resultados exportados para: {filepath}")




[documentos]
def export_json(results: List[InferenceResult], filepath: str, context_tokens: int, quantization: Quantization):
    """Exports results to JSON.

    Exporta resultados para JSON.
    """
    data = {
        "context_tokens": context_tokens,
        "quantization": quantization.value,
        "results": [r.to_dict() for r in results],
    }
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\n✓ Results exported to: / Resultados exportados para: {filepath}")




[documentos]
def list_models():
    """Prints all available models.

    Imprime todos os modelos disponíveis.
    """
    models = get_all_models()
    print("\n" + "=" * 70)
    print("AVAILABLE MODELS / MODELOS DISPONÍVEIS")
    print("=" * 70)

    for model in models:
        print(f"\n  [{model.params_billion}B] {model.name}")
        print(f"     Architecture: {model.architecture}")
        print(f"     Default precision: {model.precision_default}")
        print(f"     KV cache: {model.kv_cache_mb_per_token} MB/token (FP16 baseline)")

    print("\n" + "=" * 70)
    print("\nUsage: python main.py --model <size>  (e.g., --model 7)")




[documentos]
def print_model_vram_breakdown(model: LLMModel, context_tokens: int, quantization: Quantization, calculation_mode: CalculationMode = CalculationMode.CONSERVATIVE):
    """Prints detailed VRAM breakdown for a specific model.

    Imprime breakdown detalhado de VRAM para um modelo específico.

    Args:
        model: LLM model to analyze
        context_tokens: Context size in tokens
        quantization: Quantization type
        calculation_mode: Calculation mode
    """
    from calculator import VRAMCalculator, BYTES_PER_PARAM, KV_CACHE_MULTIPLIER

    calc = VRAMCalculator(quantization=quantization, calculation_mode=calculation_mode)
    breakdown = calc.calculate_total_vram(model, context_tokens)

    # Calculate real-world estimates
    # Idle: model loaded, no active generation
    # Peak: during token generation (KV cache fully allocated)
    idle_estimate = breakdown.params_memory_gb + breakdown.overhead_gb
    peak_estimate = breakdown.total_vram_gb

    # Determine if close to 24GB limit for warning color
    is_tight_24gb = 22 <= breakdown.total_vram_gb <= 24

    print("\n" + "=" * 70)
    print(f"{Colors.BOLD}VRAM BREAKDOWN: {model.name}{Colors.RESET}")
    print("=" * 70)
    print(f"\n{Colors.CYAN}Configuration:{Colors.RESET}")
    print(f"  Batch size:           1 (inference only)")
    print(f"  Context:              {context_tokens:,} tokens")
    print(f"  Quantization backend: {Colors.bold(quantization.value.upper())} ({BYTES_PER_PARAM[quantization]} bytes/param)")
    print(f"  KV cache precision:   FP16 (default) | Quantized (experimental)")
    print(f"  Calculation mode:     {calculation_mode.value}")
    print(f"  Memory allocator:     PyTorch-style (HF Transformers, vLLM)")

    print(f"\n{Colors.CYAN}Memory Breakdown:{Colors.RESET}")
    print(f"  Model parameters:     {breakdown.params_memory_gb:.2f} GB")
    print(f"  Overhead (30%):       {Colors.dim(f'{breakdown.overhead_gb:.2f} GB')}")
    print(f"  Model + overhead:     {breakdown.model_with_overhead_gb:.2f} GB")
    print(f"  KV cache (FP16):      {Colors.warning(f'{breakdown.kv_cache_gb:.2f} GB')} ({calculation_mode.value} mode)")
    print(f"  " + "-" * 40)
    print(f"  {Colors.BOLD}TOTAL VRAM:{Colors.RESET:15} {Colors.bold(f'{breakdown.total_vram_gb:.2f} GB')}")

    print(f"\n{Colors.CYAN}Real-World Usage Estimates:{Colors.RESET}")
    print(f"  Idle (model loaded):      {idle_estimate:.2f} GB")
    print(f"  Peak (generation):        {Colors.warning(f'{peak_estimate:.2f} GB')}")

    print(f"\nMinimum GPU VRAM required: {breakdown.total_vram_gb:.1f} GB")
    print(f"Recommended (with margin): {Colors.ok(f'{breakdown.total_vram_gb * 1.1:.1f} GB')}")

    # Show assumptions for production mode
    print(f"\n{Colors.DIM}  Assumptions:{Colors.RESET}")
    print(f"    • batch_size = 1 (no batching)")
    print(f"    • No LoRA adapters active")
    print(f"    • No speculative decoding")
    print(f"    • No tool calling overhead")
    print(f"    • PyTorch allocator (TensorRT-LLM / llama.cpp may vary)")
    print(f"    • KV cache in FP16 (quantized KV cache is experimental)")
    print(f"      → Weights INT4 ≠ KV cache INT4 in most frameworks")

    print(f"\n{Colors.DIM}  Scaling notes:{Colors.RESET}")
    print(f"    • KV cache scales linearly with context length")
    print(f"      → 16k context ≈ {breakdown.kv_cache_gb * 2:.1f} GB KV cache")
    print(f"      → 32k context ≈ {breakdown.kv_cache_gb * 4:.1f} GB KV cache")
    print(f"    • KV cache scales linearly with batch size")
    print(f"      → batch_size = 4 ≈ +{breakdown.kv_cache_gb * 3:.1f} GB KV cache")
    print(f"    • VRAM calculations do not account for throughput or latency")
    print(f"      → This tool measures {Colors.bold('capacity')}, not speed")

    # Warning for 24GB GPUs near limit
    if is_tight_24gb:
        print(f"\n  {Colors.BG_RED}{Colors.WHITE} ⚠️  WARNING: 24GB GPUs run at the limit.{Colors.RESET}")
        print(f"     Any batching, adapters (LoRA), or additional features may cause OOM.")

    print("=" * 70)




[documentos]
def estimate_kv_cache(params_billion: int) -> float:
    """Estimate KV cache per token based on model size.

    Estima KV cache por token baseado no tamanho do modelo.

    Uses a conservative formula based on decoder-only architecture.
    Usa uma fórmula conservadora baseada em arquitetura decoder-only.

    Args:
        params_billion: Model size in billions of parameters

    Returns:
        Estimated KV cache in MB per token (FP16)
    """
    # Approximate KV cache scaling based on model size
    # Escalonamento aproximado de KV cache baseado no tamanho do modelo
    # Formula: kv_cache ≈ 0.6 * sqrt(params_billion / 7)
    # This is a rough approximation for decoder-only models
    if params_billion <= 1:
        return 0.05
    elif params_billion <= 3:
        return 0.15
    elif params_billion <= 7:
        return 0.4
    elif params_billion <= 13:
        return 0.6
    elif params_billion <= 30:
        return 1.0
    elif params_billion <= 70:
        return 2.0
    elif params_billion <= 100:
        return 3.0
    else:
        # For very large models, KV cache grows roughly with sqrt of params
        return 3.0 * (params_billion / 100) ** 0.5




[documentos]
def parse_args():
    """Parse CLI arguments.

    Parse argumentos da CLI.
    """
    parser = argparse.ArgumentParser(
        description="LLM Local Inference Viability Calculator / "
                    "Calculadora de viabilidade de inferência local de LLMs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples / Exemplos:
  python main.py --context 4096
  python main.py -c 8192 --gpu-type consumer
  python main.py -c 4096 --only-runs --export-json results.json
  python main.py -c 16384 --group-gpu
  python main.py -c 8192 --quantization int4
  python main.py --list-models
  python main.py --model 7 --context 8192
  python main.py -m 70 -c 16384 -q int4
  python main.py -m 70 -c 8192 -q int4 --mode production

Generic model / Modelo genérico:
  python main.py --params-b 405 --context 8192 --quantization int4
  python main.py --params-b 405 --kv-cache 15.0 --context 8192
  python main.py --params-b 405 --model-name "Llama 3.1 405B" -c 8192

Available precisions / Precisões disponíveis:
  fp32 - Float32 (4 bytes/param) - Original precision, highest quality
  fp16 - Float16 (2 bytes/param) - Half VRAM, excellent quality
  int8 - Int8 (1 byte/param) - Quarter VRAM, small quality loss
  int4 - Int4 (0.5 byte/param) - Eighth VRAM, noticeable quality loss

Calculation modes / Modos de cálculo:
  theoretical - Ideal minimum (batch=1, no padding/alignment)
  conservative - Default mode with 10%% buffer (minimal overhead)
  production  - Real-world serving (batch>1, fragmentation) with 25%% buffer
        """,
    )

    parser.add_argument(
        "-c", "--context",
        type=int,
        default=4096,
        help="Context size in tokens / Tamanho do contexto em tokens (default: 4096)",
    )

    parser.add_argument(
        "--list-models",
        action="store_true",
        help="List all available models / Listar todos os modelos disponíveis",
    )

    parser.add_argument(
        "-m", "--model",
        type=float,
        metavar="SIZE",
        help="Model size in billions of parameters (e.g., 0.6, 7, 13, 70) / "
             "Tamanho do modelo em bilhões de parâmetros",
    )

    parser.add_argument(
        "--gpu-type",
        choices=["consumer", "datacenter", "all"],
        default="all",
        help="GPU type to consider / Tipo de GPU a considerar (default: all)",
    )

    parser.add_argument(
        "--only-runs",
        action="store_true",
        help="Show only running combinations / Mostrar apenas combinações que rodam",
    )

    parser.add_argument(
        "--group-gpu",
        action="store_true",
        help="Group results by GPU instead of model / "
             "Agrupar resultados por GPU em vez de por modelo",
    )

    parser.add_argument(
        "--summary",
        choices=["model", "gpu", "both", "none"],
        default="both",
        help="Summary type to show / Tipo de resumo a mostrar (default: both)",
    )

    parser.add_argument(
        "--export-csv",
        metavar="FILE",
        help="Export results to CSV / Exportar resultados para CSV",
    )

    parser.add_argument(
        "--export-json",
        metavar="FILE",
        help="Export results to JSON / Exportar resultados para JSON",
    )

    parser.add_argument(
        "-q", "--quantization",
        choices=["fp32", "fp16", "int8", "int4"],
        default="fp16",
        help="Model precision/quantization / Precisão do modelo (fp32, fp16, int8, int4)",
    )

    parser.add_argument(
        "--mode",
        choices=["theoretical", "conservative", "production"],
        default="conservative",
        help="Calculation mode / Modo de cálculo "
             "(theoretical=ideal minimum, conservative=default, production=real-world serving)",
    )

    # Generic model parameters / Parâmetros de modelo genérico
    parser.add_argument(
        "--params-b",
        type=int,
        metavar="BILLIONS",
        help="Generic model: parameters in billions (e.g., 8, 70, 405) / "
             "Modelo genérico: parâmetros em bilhões (ex: 8, 70, 405)",
    )

    parser.add_argument(
        "--kv-cache",
        type=float,
        metavar="MB_PER_TOKEN",
        help="Generic model: KV cache in MB per token FP16 (e.g., 0.6, 1.0, 4.27) / "
             "Modelo genérico: KV cache em MB por token FP16 (ex: 0.6, 1.0, 4.27)",
    )

    parser.add_argument(
        "--model-name",
        type=str,
        metavar="NAME",
        help="Generic model: custom name for display / "
             "Modelo genérico: nome personalizado para exibição",
    )

    # -----------------------------------------------------------------------
    # NEW: Advanced configuration options
    # NOVOS: Opções de configuração avançadas
    # -----------------------------------------------------------------------

    parser.add_argument(
        "--optimize-config",
        action="store_true",
        help="Show optimal layer offload configuration / "
             "Mostrar configuração ótima de offload de camadas",
    )

    parser.add_argument(
        "--cpu-offload",
        action="store_true",
        help="Enable CPU offload calculations / "
             "Habilitar cálculos de offload de CPU",
    )

    parser.add_argument(
        "--system-ram",
        type=float,
        default=32.0,
        metavar="GB",
        help="System RAM available in GB (for CPU offload) / "
             "RAM do sistema disponível em GB (para offload de CPU) (default: 32.0)",
    )

    parser.add_argument(
        "--pcie-gen",
        choices=["3.0", "4.0", "5.0"],
        default="4.0",
        help="PCIe generation for bandwidth estimation / "
             "Geração PCIe para estimativa de largura de banda (default: 4.0)",
    )

    parser.add_argument(
        "--multi-gpu",
        action="store_true",
        help="Enable multi-GPU mode / "
             "Habilitar modo multi-GPU",
    )

    parser.add_argument(
        "--gpu-config",
        type=str,
        metavar="CONFIG",
        help="Multi-GPU configuration (e.g., '2x4090,1x3090') / "
             "Configuração multi-GPU (ex: '2x4090,1x3090')",
    )

    parser.add_argument(
        "--multi-gpu-mode",
        choices=["tensor", "pipeline"],
        default="tensor",
        help="Multi-GPU parallelism mode / "
             "Modo de paralelismo multi-GPU (default: tensor)",
    )

    parser.add_argument(
        "--gguf-file",
        type=str,
        metavar="FILENAME",
        help="GGUF filename to auto-detect quantization / "
             "Nome de arquivo GGUF para auto-detectar quantização",
    )

    parser.add_argument(
        "--format",
        choices=["fp16", "gguf", "exl2", "gptq", "awq"],
        default="fp16",
        help="Model format for overhead calculation / "
             "Formato do modelo para cálculo de overhead (default: fp16)",
    )

    return parser.parse_args()




[documentos]
def main():
    """Main CLI function.

    Função principal da CLI.
    """
    args = parse_args()

    # Handle --list-models
    if args.list_models:
        list_models()
        sys.exit(0)

    # Validate context
    # Validar contexto
    if args.context <= 0:
        print("Error: context_tokens must be positive / Erro: context_tokens deve ser positivo",
              file=sys.stderr)
        sys.exit(1)

    # Map quantization (needed early for GGUF detection)
    # Mapear quantização (necessário cedo para detecção GGUF)
    quant_map = {
        "fp32": Quantization.FP32,
        "fp16": Quantization.FP16,
        "int8": Quantization.INT8,
        "int4": Quantization.INT4,
    }
    quantization = quant_map[args.quantization]

    # Map calculation mode
    # Mapear modo de cálculo
    mode_map = {
        "theoretical": CalculationMode.THEORETICAL,
        "conservative": CalculationMode.CONSERVATIVE,
        "production": CalculationMode.PRODUCTION,
    }
    calculation_mode = mode_map[args.mode]

    # Map model format
    # Mapear formato do modelo
    format_map = {
        "fp16": ModelFormat.FP16,
        "gguf": ModelFormat.GGUF,
        "exl2": ModelFormat.EXL2,
        "gptq": ModelFormat.GPTQ,
        "awq": ModelFormat.AWQ,
    }
    model_format = format_map[args.format]

    # Handle GGUF auto-detection
    # Lidar com auto-detecção GGUF
    if args.gguf_file:
        gguf_info = detect_gguf_quantization(args.gguf_file)
        if gguf_info.is_gguf:
            print(f"\n{Colors.info('GGUF file detected:')} {args.gguf_file}")
            print(f"  Quantization: {gguf_info.quant_name}")
            print(f"  Effective bits: {gguf_info.bits_per_param:.2f} bits/param")
            # Update quantization based on GGUF detection
            if gguf_info.quant_type != "fp16":
                quantization = quant_map[gguf_info.quant_type]
                print(f"  Using quantization: {quantization.value}")
            model_format = ModelFormat.GGUF

    # -----------------------------------------------------------------------
    # Handle advanced configuration modes
    # Lidar com modos de configuração avançados
    # -----------------------------------------------------------------------

    # Multi-GPU mode
    # Modo multi-GPU
    if args.multi_gpu and args.gpu_config:
        try:
            multi_gpus = parse_gpu_config_string(args.gpu_config, get_all_gpus())

            # Get or create model
            model = None
            if args.params_b is not None:
                kv_cache = args.kv_cache if args.kv_cache else estimate_kv_cache(args.params_b)
                model_name = args.model_name if args.model_name else f"Custom Model {args.params_b}B"
                model = LLMModel(
                    name=model_name,
                    params_billion=args.params_b,
                    architecture="decoder-only",
                    precision_default="fp16",
                    kv_cache_mb_per_token=kv_cache,
                    format=model_format,
                )
            elif args.model:
                model = get_model_by_size(args.model)
                if not model:
                    kv_cache = estimate_kv_cache(args.model)
                    model = LLMModel(
                        name=f"Generic Model {args.model}B",
                        params_billion=args.model,
                        architecture="decoder-only",
                        precision_default="fp16",
                        kv_cache_mb_per_token=kv_cache,
                        format=model_format,
                    )

            if not model:
                print("Error: Please specify --model or --params-b with --multi-gpu", file=sys.stderr)
                sys.exit(1)

            # Create multi-GPU config
            mode_map_gpu = {"tensor": MultiGPUMode.TENSOR_PARALLEL, "pipeline": MultiGPUMode.PIPELINE_PARALLEL}
            multi_gpu_mode = mode_map_gpu[args.multi_gpu_mode]
            config = MultiGPUConfig(gpus=multi_gpus, mode=multi_gpu_mode)

            # Calculate
            calc = MultiGPUCalculator(quantization=quantization, model_format=model_format)
            result = calc.calculate(model, config, args.context)

            # Print result
            print_multi_gpu_result(result, model)
            sys.exit(0)

        except ValueError as e:
            print(f"Error parsing GPU config: {e}", file=sys.stderr)
            sys.exit(1)

    # Handle --model (specific model) or --params-b (generic model)
    # Lidar com --model (modelo específico) ou --params-b (modelo genérico)
    model = None
    use_generic = False

    # Generic model takes precedence / Modelo genérico tem precedência
    if args.params_b is not None:
        # Create generic model / Criar modelo genérico
        params_b = args.params_b
        kv_cache = args.kv_cache if args.kv_cache else estimate_kv_cache(params_b)
        model_name = args.model_name if args.model_name else f"Custom Model {params_b}B"

        model = LLMModel(
            name=model_name,
            params_billion=params_b,
            architecture="decoder-only",
            precision_default="fp16",
            kv_cache_mb_per_token=kv_cache,
        )
        use_generic = True

    elif args.model:
        model = get_model_by_size(args.model)
        if not model:
            # Model not found in database - offer to use as generic
            # Modelo não encontrado no banco - oferecer usar como genérico
            print(f"\n{Colors.warn('Model size ' + str(args.model) + 'B not found in database.')}")
            print(f"{Colors.dim('Using generic model with estimated KV cache.')}")
            print(f"Use --params-b {args.model} --kv-cache <value> for custom KV cache.")
            print(f"Or use --list-models to see all available models.\n")

            # Create generic model as fallback / Criar modelo genérico como fallback
            kv_cache = args.kv_cache if args.kv_cache else estimate_kv_cache(args.model)
            model = LLMModel(
                name=f"Generic Model {args.model}B",
                params_billion=args.model,
                architecture="decoder-only",
                precision_default="fp16",
                kv_cache_mb_per_token=kv_cache,
            )
            use_generic = True

    if model:
        # Show VRAM breakdown for the specific model
        print_model_vram_breakdown(model, args.context, quantization, calculation_mode)

        # Determine which GPU to use for advanced calculations
        target_gpu = None
        if args.optimize_config or args.cpu_offload:
            # Use largest GPU or allow user to specify via --gpu flag
            # For now, use the largest available GPU
            gpus = get_all_gpus()
            target_gpu = max(gpus, key=lambda g: g.vram_gb)

            # For CPU offload, get target GPU from user or default to largest
            if args.cpu_offload:
                cpu_calc = CPUOffloadCalculator(
                    quantization=quantization,
                    system_ram_gb=args.system_ram,
                    pcie_generation=args.pcie_gen,
                    model_format=model_format,
                )
                cpu_result = cpu_calc.calculate_offload(model, target_gpu, args.context)
                print_cpu_offload_result(cpu_result, model)
                sys.exit(0)

            # For layer offload optimization
            if args.optimize_config:
                layer_calc = LayerOffloadCalculator(
                    quantization=quantization,
                    model_format=model_format,
                )
                layer_result = layer_calc.calculate_optimal_offload(model, target_gpu, args.context)
                print_layer_offload_result(layer_result, model, target_gpu)

                # Show offload options for all GPUs
                print(f"\n{Colors.CYAN}Offload Options for All GPUs:{Colors.RESET}")
                for gpu in sorted(get_all_gpus(), key=lambda g: g.vram_gb, reverse=True):
                    result = layer_calc.calculate_optimal_offload(model, gpu, args.context)
                    status_color = Colors.ok if result.status == "full_gpu" else Colors.warning
                    print(f"  {gpu.name:<25} ({gpu.vram_gb:3} GB): "
                          f"{result.layers_on_gpu}/{result.total_layers} layers on GPU "
                          f"({status_color(result.status)}){Colors.RESET}")

                print("\n" + "=" * 70)
                sys.exit(0)

        # Show which GPUs can run this model
        gpus = get_all_gpus()
        calculator = VRAMCalculator(quantization=quantization, calculation_mode=calculation_mode)
        results = []
        for gpu in gpus:
            result = calculator.evaluate_pair(model, gpu, args.context)
            results.append(result)

        print(f"\nGPU COMPATIBILITY ({args.context:,} tokens, {quantization.value.upper()}):")
        print("-" * 70)

        runnable = [r for r in results if r.status == Status.RUNS]
        not_runnable = [r for r in results if r.status != Status.RUNS]

        if runnable:
            print(f"\n✓ RUNS on these GPUs:")
            for r in sorted(runnable, key=lambda x: x.gpu_vram_gb):
                free_pct = r.vram_free_percent
                print(f"  {r.gpu_name:<25} ({r.gpu_vram_gb:3} GB) - "
                      f"{free_pct:4.1f}% free")

        if not_runnable:
            print(f"\n✗ DOESN'T RUN - needs more VRAM:")
            sorted_by_need = sorted(not_runnable, key=lambda x: -(x.required_vram_gb - x.gpu_vram_gb))[:5]
            for r in sorted_by_need:
                print(f"  {r.gpu_name:<25} ({r.gpu_vram_gb:3} GB) - "
                      f"needs {r.required_vram_gb:.1f} GB")

            # Suggest layer offload option
            print(f"\n{Colors.info('💡 Tip: Use --optimize-config to see layer offload options')}")
            print(f"{Colors.dim('   Some layers can run on GPU while others use system RAM.')}")

        print("\n" + "=" * 70)
        sys.exit(0)

    # Original flow: show all combinations
    # Fluxo original: mostrar todas as combinações

    # Quantization info
    # Info sobre quantização
    bytes_per_param = quantization.bytes_per_param
    kv_mult = quantization.kv_cache_multiplier
    if args.quantization != "fp16":
        print(
            f"\nℹ️  Using {args.quantization.upper()}: / Usando {args.quantization.upper()}: "
            f"{bytes_per_param} bytes/param, "
            f"KV cache ×{kv_mult}"
        )

    # Select GPUs
    # Selecionar GPUs
    if args.gpu_type == "consumer":
        gpus = get_consumer_gpus()
        gpu_type_label = "Consumer"
    elif args.gpu_type == "datacenter":
        gpus = get_datacenter_gpus()
        gpu_type_label = "Datacenter"
    else:
        gpus = get_all_gpus()
        gpu_type_label = "All / Todas"

    # Select models
    # Selecionar modelos
    models = get_all_models()

    # Calculate
    # Calcular
    calculator = VRAMCalculator(quantization=quantization, calculation_mode=calculation_mode)
    results = []
    for model in models:
        for gpu in gpus:
            result = calculator.evaluate_pair(model, gpu, args.context)
            results.append(result)

    # Header
    print("\n" + "=" * 70)
    print("LLM LOCAL INFERENCE VIABILITY CALCULATOR")
    print("CALCULADORA DE VIABILIDADE DE INFERÊNCIA LOCAL DE LLMs")
    print("=" * 70)
    print(f"\nConfiguration / Configuração:")
    print(f"  • Batch size: 1 (inference)")
    print(f"  • Context: / Contexto: {args.context:,} tokens")
    print(f"  • Quantization: / Quantização: {Colors.bold(args.quantization.upper())}")
    print(f"  • Mode: / Modo: {args.mode}")
    print(f"  • GPUs: {gpu_type_label} ({len(gpus)} models / modelos)")
    print(f"  • LLM Models: / Modelos LLM: {len(models)} sizes / tamanhos")

    # Main table
    # Tabela principal
    print_table(
        results,
        group_by_gpu=args.group_gpu,
        show_only_runs=args.only_runs,
    )

    # Summaries
    # Resumos
    if args.summary in ("model", "both"):
        print_summary_by_model(results)

    if args.summary in ("gpu", "both"):
        print_summary_by_gpu(results)

    # Exports
    # Exportações
    if args.export_csv:
        export_csv(results, args.export_csv)

    if args.export_json:
        export_json(results, args.export_json, args.context, quantization)

    # Warning for 24GB GPUs running near limit
    # Aviso para GPUs de 24GB rodando no limite
    tight_24gb = [
        r for r in results
        if r.status == Status.RUNS
        and r.gpu_vram_gb == 24
        and 22 <= r.required_vram_gb <= 24
    ]
    if tight_24gb:
        print("\n" + "⚠️  " * 12)
        print(f"\n{Colors.BOLD}NOTICE: 24GB GPUs running at the limit:{Colors.RESET}")
        print(f"  {Colors.warning('• Any batching (batch_size > 1) may cause OOM')}")
        print(f"  {Colors.warning('• LoRA adapters add ~0.5-2GB per adapter')}")
        print(f"  {Colors.warning('• Speculative decoding adds ~30-50% memory')}")
        print(f"  {Colors.warning('• Tool calling / function calling adds overhead')}")
        print(f"\n{Colors.DIM}  Assumptions for calculations above:{Colors.RESET}")
        print(f"    Memory model: PyTorch allocator (HF Transformers, vLLM)")
        print(f"    KV cache: FP16 (quantized KV is experimental/exclusive)")
        print(f"    batch_size = 1 (no batching)")
        print(f"    No LoRA adapters active")
        print(f"    No speculative decoding")
        print(f"    No tool calling overhead")
        print(f"  {Colors.DIM}Note: TensorRT-LLM, llama.cpp, EXL2 may have different behavior{Colors.RESET}")
        print()

    print("\n" + "=" * 70)
    print()



if __name__ == "__main__":
    main()