Source code for main

#!/usr/bin/env python3
"""
CLI for LLM local inference viability calculator.

Allows quickly discovering which models run on which GPU
for a given context size.

CLI para calculadora de viabilidade de inferência local de LLMs.
Permite descobrir rapidamente quais modelos rodam em qual GPU
para um determinado tamanho de contexto.
"""

import argparse
import csv
import json
import sys
from typing import List

from models import LLMModel, get_all_models, get_model_by_size
from gpus import GPU, get_all_gpus, get_consumer_gpus, get_datacenter_gpus, get_gpu_by_name
from calculator import (
    VRAMCalculator, Quantization, InferenceResult, Status, CalculationMode,
    LayerOffloadCalculator, LayerOffloadResult,
    CPUOffloadCalculator, CPUOffloadResult,
)
from formats import ModelFormat, detect_gguf_quantization, get_format_from_filename
from multi_gpu import (
    MultiGPUConfig, MultiGPUCalculator, MultiGPUMode,
    parse_gpu_config_string, create_multi_gpu_config,
)


# Terminal colors for enhanced readability
# Cores de terminal para melhor legibilidade
class Colors:
    """ANSI color codes for terminal output.

    Códigos de cores ANSI para saída de terminal.
    """

    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Foreground colors
    BLACK = "\033[30m"
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"

    # Background colors
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_YELLOW = "\033[43m"
    BG_BLUE = "\033[44m"

    @staticmethod
    def ok(text: str) -> str:
        """Green text for success/OK messages."""
        return f"{Colors.GREEN}{text}{Colors.RESET}"

    @staticmethod
    def warning(text: str) -> str:
        """Yellow text for warnings."""
        return f"{Colors.YELLOW}{text}{Colors.RESET}"

    @staticmethod
    def warn(text: str) -> str:
        """Yellow text for warnings (alias)."""
        return f"{Colors.YELLOW}{text}{Colors.RESET}"

    @staticmethod
    def error(text: str) -> str:
        """Red text for errors."""
        return f"{Colors.RED}{text}{Colors.RESET}"

    @staticmethod
    def info(text: str) -> str:
        """Blue text for info."""
        return f"{Colors.BLUE}{text}{Colors.RESET}"

    @staticmethod
    def dim(text: str) -> str:
        """Dim text for less emphasis."""
        return f"{Colors.DIM}{text}{Colors.RESET}"

    @staticmethod
    def bold(text: str) -> str:
        """Bold text."""
        return f"{Colors.BOLD}{text}{Colors.RESET}"
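# Usage note (illustrative, not part of the original module): the helpers can
# be nested, e.g. Colors.bold(Colors.ok("RUNS")) wraps "RUNS" in bold + green
# escape codes and resets the terminal style at the end.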
def print_summary_by_gpu(results: List[InferenceResult]):
    """Prints summary grouped by GPU.

    Imprime resumo agrupado por GPU.

    Shows for each GPU which models it supports.
    Mostra para cada GPU quais modelos suporta.
    """
    print("\n" + "=" * 70)
    print("SUMMARY BY GPU / RESUMO POR GPU")
    print("=" * 70)

    # Group by GPU
    # Agrupar por GPU
    from collections import defaultdict
    by_gpu: dict[str, List[InferenceResult]] = defaultdict(list)
    for r in results:
        by_gpu[r.gpu_name].append(r)

    # Sort GPUs by VRAM
    # Ordenar GPUs por VRAM
    gpu_vram = {r.gpu_name: r.gpu_vram_gb for r in results}
    sorted_gpus = sorted(by_gpu.keys(), key=lambda g: gpu_vram[g])

    for gpu_name in sorted_gpus:
        gpu_results = by_gpu[gpu_name]
        runnable = [r for r in gpu_results if r.status == Status.RUNS]
        vram = gpu_results[0].gpu_vram_gb

        print(f"\n{gpu_name} ({vram} GB):")
        if runnable:
            models = sorted(set(r.model_params_billion for r in runnable))
            print(f"  ✓ Supports: / Suporta: {', '.join(f'{m}B' for m in models)}")
        else:
            print("  ✗ Doesn't support any listed model / Não suporta nenhum modelo listado")
def export_csv(results: List[InferenceResult], filepath: str):
    """Exports results to CSV.

    Exporta resultados para CSV.
    """
    with open(filepath, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "Model", "Params_B", "GPU", "GPU_VRAM_GB",
            "VRAM_Required_GB", "Status", "VRAM_Free_%",
            "Quantization", "Warning",
        ])
        for r in results:
            writer.writerow([
                r.model_name,
                r.model_params_billion,
                r.gpu_name,
                r.gpu_vram_gb,
                round(r.required_vram_gb, 2),
                r.status.value,
                round(r.vram_free_percent, 1),
                r.quantization.value,
                r.warning or "",
            ])

    print(f"\n✓ Results exported to: / Resultados exportados para: {filepath}")
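# The CSV produced above starts with this header row (taken directly from the
# writerow call), followed by one row per model/GPU pair:
# Model,Params_B,GPU,GPU_VRAM_GB,VRAM_Required_GB,Status,VRAM_Free_%,Quantization,Warning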
def export_json(results: List[InferenceResult], filepath: str,
                context_tokens: int, quantization: Quantization):
    """Exports results to JSON.

    Exporta resultados para JSON.
    """
    data = {
        "context_tokens": context_tokens,
        "quantization": quantization.value,
        "results": [r.to_dict() for r in results],
    }
    with open(filepath, "w") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"\n✓ Results exported to: / Resultados exportados para: {filepath}")
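# Sketch of the exported JSON shape (the top-level keys come from the dict
# above; the per-result fields depend on InferenceResult.to_dict() in
# calculator.py, so the inner entries here are only a guess):
# {
#   "context_tokens": 8192,
#   "quantization": "int4",
#   "results": [
#     {"model_name": "...", "gpu_name": "...", "status": "...", ...}
#   ]
# }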
def list_models():
    """Prints all available models.

    Imprime todos os modelos disponíveis.
    """
    models = get_all_models()

    print("\n" + "=" * 70)
    print("AVAILABLE MODELS / MODELOS DISPONÍVEIS")
    print("=" * 70)

    for model in models:
        print(f"\n  [{model.params_billion}B] {model.name}")
        print(f"      Architecture: {model.architecture}")
        print(f"      Default precision: {model.precision_default}")
        print(f"      KV cache: {model.kv_cache_mb_per_token} MB/token (FP16 baseline)")

    print("\n" + "=" * 70)
    print("\nUsage: python main.py --model <size> (e.g., --model 7)")
def estimate_kv_cache(params_billion: float) -> float:
    """Estimate KV cache per token based on model size.

    Estima KV cache por token baseado no tamanho do modelo.

    Uses a conservative formula based on decoder-only architecture.
    Usa uma fórmula conservadora baseada em arquitetura decoder-only.

    Args:
        params_billion: Model size in billions of parameters

    Returns:
        Estimated KV cache in MB per token (FP16)
    """
    # Approximate KV cache scaling based on model size
    # Escalonamento aproximado de KV cache baseado no tamanho do modelo
    # Formula: kv_cache ≈ 0.6 * sqrt(params_billion / 7)
    # This is a rough approximation for decoder-only models
    if params_billion <= 1:
        return 0.05
    elif params_billion <= 3:
        return 0.15
    elif params_billion <= 7:
        return 0.4
    elif params_billion <= 13:
        return 0.6
    elif params_billion <= 30:
        return 1.0
    elif params_billion <= 70:
        return 2.0
    elif params_billion <= 100:
        return 3.0
    else:
        # For very large models, KV cache grows roughly with sqrt of params
        return 3.0 * (params_billion / 100) ** 0.5
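# Worked example (illustrative, not part of the original module): for a 70B
# model the table above yields 2.0 MB/token at FP16, so a 16,384-token context
# needs roughly 16384 * 2.0 MB ≈ 32 GB of KV cache, before model weights and
# runtime overhead are counted.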
def parse_args():
    """Parse CLI arguments.

    Parse argumentos da CLI.
    """
    parser = argparse.ArgumentParser(
        description="LLM Local Inference Viability Calculator / "
                    "Calculadora de viabilidade de inferência local de LLMs",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples / Exemplos:
  python main.py --context 4096
  python main.py -c 8192 --gpu-type consumer
  python main.py -c 4096 --only-runs --export-json results.json
  python main.py -c 16384 --group-gpu
  python main.py -c 8192 --quantization int4
  python main.py --list-models
  python main.py --model 7 --context 8192
  python main.py -m 70 -c 16384 -q int4
  python main.py -m 70 -c 8192 -q int4 --mode production

Generic model / Modelo genérico:
  python main.py --params-b 405 --context 8192 --quantization int4
  python main.py --params-b 405 --kv-cache 15.0 --context 8192
  python main.py --params-b 405 --model-name "Llama 3.1 405B" -c 8192

Available precisions / Precisões disponíveis:
  fp32 - Float32 (4 bytes/param)   - Original precision, highest quality
  fp16 - Float16 (2 bytes/param)   - Half VRAM, excellent quality
  int8 - Int8 (1 byte/param)       - Quarter VRAM, small quality loss
  int4 - Int4 (0.5 byte/param)     - Eighth VRAM, noticeable quality loss

Calculation modes / Modos de cálculo:
  theoretical  - Ideal minimum (batch=1, no padding/alignment)
  conservative - Default mode with 10%% buffer (minimal overhead)
  production   - Real-world serving (batch>1, fragmentation) with 25%% buffer
""",
    )

    parser.add_argument(
        "-c", "--context",
        type=int,
        default=4096,
        help="Context size in tokens / Tamanho do contexto em tokens (default: 4096)",
    )
    parser.add_argument(
        "--list-models",
        action="store_true",
        help="List all available models / Listar todos os modelos disponíveis",
    )
    parser.add_argument(
        "-m", "--model",
        type=float,
        metavar="SIZE",
        help="Model size in billions of parameters (e.g., 0.6, 7, 13, 70) / "
             "Tamanho do modelo em bilhões de parâmetros",
    )
    parser.add_argument(
        "--gpu-type",
        choices=["consumer", "datacenter", "all"],
        default="all",
        help="GPU type to consider / Tipo de GPU a considerar (default: all)",
    )
    parser.add_argument(
        "--only-runs",
        action="store_true",
        help="Show only running combinations / Mostrar apenas combinações que rodam",
    )
    parser.add_argument(
        "--group-gpu",
        action="store_true",
        help="Group results by GPU instead of model / "
             "Agrupar resultados por GPU em vez de por modelo",
    )
    parser.add_argument(
        "--summary",
        choices=["model", "gpu", "both", "none"],
        default="both",
        help="Summary type to show / Tipo de resumo a mostrar (default: both)",
    )
    parser.add_argument(
        "--export-csv",
        metavar="FILE",
        help="Export results to CSV / Exportar resultados para CSV",
    )
    parser.add_argument(
        "--export-json",
        metavar="FILE",
        help="Export results to JSON / Exportar resultados para JSON",
    )
    parser.add_argument(
        "-q", "--quantization",
        choices=["fp32", "fp16", "int8", "int4"],
        default="fp16",
        help="Model precision/quantization / Precisão do modelo (fp32, fp16, int8, int4)",
    )
    parser.add_argument(
        "--mode",
        choices=["theoretical", "conservative", "production"],
        default="conservative",
        help="Calculation mode / Modo de cálculo "
             "(theoretical=ideal minimum, conservative=default, production=real-world serving)",
    )

    # Generic model parameters / Parâmetros de modelo genérico
    parser.add_argument(
        "--params-b",
        type=int,
        metavar="BILLIONS",
        help="Generic model: parameters in billions (e.g., 8, 70, 405) / "
             "Modelo genérico: parâmetros em bilhões (ex: 8, 70, 405)",
    )
    parser.add_argument(
        "--kv-cache",
        type=float,
        metavar="MB_PER_TOKEN",
        help="Generic model: KV cache in MB per token FP16 (e.g., 0.6, 1.0, 4.27) / "
             "Modelo genérico: KV cache em MB por token FP16 (ex: 0.6, 1.0, 4.27)",
    )
    parser.add_argument(
        "--model-name",
        type=str,
        metavar="NAME",
        help="Generic model: custom name for display / "
             "Modelo genérico: nome personalizado para exibição",
    )

    # -----------------------------------------------------------------------
    # NEW: Advanced configuration options
    # NOVOS: Opções de configuração avançadas
    # -----------------------------------------------------------------------
    parser.add_argument(
        "--optimize-config",
        action="store_true",
        help="Show optimal layer offload configuration / "
             "Mostrar configuração ótima de offload de camadas",
    )
    parser.add_argument(
        "--cpu-offload",
        action="store_true",
        help="Enable CPU offload calculations / "
             "Habilitar cálculos de offload de CPU",
    )
    parser.add_argument(
        "--system-ram",
        type=float,
        default=32.0,
        metavar="GB",
        help="System RAM available in GB (for CPU offload) / "
             "RAM do sistema disponível em GB (para offload de CPU) (default: 32.0)",
    )
    parser.add_argument(
        "--pcie-gen",
        choices=["3.0", "4.0", "5.0"],
        default="4.0",
        help="PCIe generation for bandwidth estimation / "
             "Geração PCIe para estimativa de largura de banda (default: 4.0)",
    )
    parser.add_argument(
        "--multi-gpu",
        action="store_true",
        help="Enable multi-GPU mode / "
             "Habilitar modo multi-GPU",
    )
    parser.add_argument(
        "--gpu-config",
        type=str,
        metavar="CONFIG",
        help="Multi-GPU configuration (e.g., '2x4090,1x3090') / "
             "Configuração multi-GPU (ex: '2x4090,1x3090')",
    )
    parser.add_argument(
        "--multi-gpu-mode",
        choices=["tensor", "pipeline"],
        default="tensor",
        help="Multi-GPU parallelism mode / "
             "Modo de paralelismo multi-GPU (default: tensor)",
    )
    parser.add_argument(
        "--gguf-file",
        type=str,
        metavar="FILENAME",
        help="GGUF filename to auto-detect quantization / "
             "Nome de arquivo GGUF para auto-detectar quantização",
    )
    parser.add_argument(
        "--format",
        choices=["fp16", "gguf", "exl2", "gptq", "awq"],
        default="fp16",
        help="Model format for overhead calculation / "
             "Formato do modelo para cálculo de overhead (default: fp16)",
    )

    return parser.parse_args()
def main():
    """Main CLI function.

    Função principal da CLI.
    """
    args = parse_args()

    # Handle --list-models
    if args.list_models:
        list_models()
        sys.exit(0)

    # Validate context
    # Validar contexto
    if args.context <= 0:
        print("Error: context_tokens must be positive / Erro: context_tokens deve ser positivo",
              file=sys.stderr)
        sys.exit(1)

    # Map quantization (needed early for GGUF detection)
    # Mapear quantização (necessário cedo para detecção GGUF)
    quant_map = {
        "fp32": Quantization.FP32,
        "fp16": Quantization.FP16,
        "int8": Quantization.INT8,
        "int4": Quantization.INT4,
    }
    quantization = quant_map[args.quantization]

    # Map calculation mode
    # Mapear modo de cálculo
    mode_map = {
        "theoretical": CalculationMode.THEORETICAL,
        "conservative": CalculationMode.CONSERVATIVE,
        "production": CalculationMode.PRODUCTION,
    }
    calculation_mode = mode_map[args.mode]

    # Map model format
    # Mapear formato do modelo
    format_map = {
        "fp16": ModelFormat.FP16,
        "gguf": ModelFormat.GGUF,
        "exl2": ModelFormat.EXL2,
        "gptq": ModelFormat.GPTQ,
        "awq": ModelFormat.AWQ,
    }
    model_format = format_map[args.format]

    # Handle GGUF auto-detection
    # Lidar com auto-detecção GGUF
    if args.gguf_file:
        gguf_info = detect_gguf_quantization(args.gguf_file)
        if gguf_info.is_gguf:
            print(f"\n{Colors.info('GGUF file detected:')} {args.gguf_file}")
            print(f"  Quantization: {gguf_info.quant_name}")
            print(f"  Effective bits: {gguf_info.bits_per_param:.2f} bits/param")
            # Update quantization based on GGUF detection
            if gguf_info.quant_type != "fp16":
                quantization = quant_map[gguf_info.quant_type]
                print(f"  Using quantization: {quantization.value}")
            model_format = ModelFormat.GGUF

    # -----------------------------------------------------------------------
    # Handle advanced configuration modes
    # Lidar com modos de configuração avançados
    # -----------------------------------------------------------------------

    # Multi-GPU mode
    # Modo multi-GPU
    if args.multi_gpu and args.gpu_config:
        try:
            multi_gpus = parse_gpu_config_string(args.gpu_config, get_all_gpus())

            # Get or create model
            model = None
            if args.params_b is not None:
                kv_cache = args.kv_cache if args.kv_cache else estimate_kv_cache(args.params_b)
                model_name = args.model_name if args.model_name else f"Custom Model {args.params_b}B"
                model = LLMModel(
                    name=model_name,
                    params_billion=args.params_b,
                    architecture="decoder-only",
                    precision_default="fp16",
                    kv_cache_mb_per_token=kv_cache,
                    format=model_format,
                )
            elif args.model:
                model = get_model_by_size(args.model)
                if not model:
                    kv_cache = estimate_kv_cache(args.model)
                    model = LLMModel(
                        name=f"Generic Model {args.model}B",
                        params_billion=args.model,
                        architecture="decoder-only",
                        precision_default="fp16",
                        kv_cache_mb_per_token=kv_cache,
                        format=model_format,
                    )

            if not model:
                print("Error: Please specify --model or --params-b with --multi-gpu",
                      file=sys.stderr)
                sys.exit(1)

            # Create multi-GPU config
            mode_map_gpu = {
                "tensor": MultiGPUMode.TENSOR_PARALLEL,
                "pipeline": MultiGPUMode.PIPELINE_PARALLEL,
            }
            multi_gpu_mode = mode_map_gpu[args.multi_gpu_mode]
            config = MultiGPUConfig(gpus=multi_gpus, mode=multi_gpu_mode)

            # Calculate
            calc = MultiGPUCalculator(quantization=quantization, model_format=model_format)
            result = calc.calculate(model, config, args.context)

            # Print result
            print_multi_gpu_result(result, model)
            sys.exit(0)

        except ValueError as e:
            print(f"Error parsing GPU config: {e}", file=sys.stderr)
            sys.exit(1)

    # Handle --model (specific model) or --params-b (generic model)
    # Lidar com --model (modelo específico) ou --params-b (modelo genérico)
    model = None
    use_generic = False

    # Generic model takes precedence / Modelo genérico tem precedência
    if args.params_b is not None:
        # Create generic model / Criar modelo genérico
        params_b = args.params_b
        kv_cache = args.kv_cache if args.kv_cache else estimate_kv_cache(params_b)
        model_name = args.model_name if args.model_name else f"Custom Model {params_b}B"
        model = LLMModel(
            name=model_name,
            params_billion=params_b,
            architecture="decoder-only",
            precision_default="fp16",
            kv_cache_mb_per_token=kv_cache,
        )
        use_generic = True
    elif args.model:
        model = get_model_by_size(args.model)
        if not model:
            # Model not found in database - offer to use as generic
            # Modelo não encontrado no banco - oferecer usar como genérico
            print(f"\n{Colors.warn('Model size ' + str(args.model) + 'B not found in database.')}")
            print(f"{Colors.dim('Using generic model with estimated KV cache.')}")
            print(f"Use --params-b {args.model} --kv-cache <value> for custom KV cache.")
            print("Or use --list-models to see all available models.\n")

            # Create generic model as fallback / Criar modelo genérico como fallback
            kv_cache = args.kv_cache if args.kv_cache else estimate_kv_cache(args.model)
            model = LLMModel(
                name=f"Generic Model {args.model}B",
                params_billion=args.model,
                architecture="decoder-only",
                precision_default="fp16",
                kv_cache_mb_per_token=kv_cache,
            )
            use_generic = True

    if model:
        # Show VRAM breakdown for the specific model
        print_model_vram_breakdown(model, args.context, quantization, calculation_mode)

        # Determine which GPU to use for advanced calculations
        target_gpu = None
        if args.optimize_config or args.cpu_offload:
            # Use largest GPU or allow user to specify via --gpu flag
            # For now, use the largest available GPU
            gpus = get_all_gpus()
            target_gpu = max(gpus, key=lambda g: g.vram_gb)

        # For CPU offload, get target GPU from user or default to largest
        if args.cpu_offload:
            cpu_calc = CPUOffloadCalculator(
                quantization=quantization,
                system_ram_gb=args.system_ram,
                pcie_generation=args.pcie_gen,
                model_format=model_format,
            )
            cpu_result = cpu_calc.calculate_offload(model, target_gpu, args.context)
            print_cpu_offload_result(cpu_result, model)
            sys.exit(0)

        # For layer offload optimization
        if args.optimize_config:
            layer_calc = LayerOffloadCalculator(
                quantization=quantization,
                model_format=model_format,
            )
            layer_result = layer_calc.calculate_optimal_offload(model, target_gpu, args.context)
            print_layer_offload_result(layer_result, model, target_gpu)

            # Show offload options for all GPUs
            print(f"\n{Colors.CYAN}Offload Options for All GPUs:{Colors.RESET}")
            for gpu in sorted(get_all_gpus(), key=lambda g: g.vram_gb, reverse=True):
                result = layer_calc.calculate_optimal_offload(model, gpu, args.context)
                status_color = Colors.ok if result.status == "full_gpu" else Colors.warning
                print(f"  {gpu.name:<25} ({gpu.vram_gb:3} GB): "
                      f"{result.layers_on_gpu}/{result.total_layers} layers on GPU "
                      f"({status_color(result.status)}){Colors.RESET}")

            print("\n" + "=" * 70)
            sys.exit(0)

        # Show which GPUs can run this model
        gpus = get_all_gpus()
        calculator = VRAMCalculator(quantization=quantization, calculation_mode=calculation_mode)
        results = []
        for gpu in gpus:
            result = calculator.evaluate_pair(model, gpu, args.context)
            results.append(result)

        print(f"\nGPU COMPATIBILITY ({args.context:,} tokens, {quantization.value.upper()}):")
        print("-" * 70)

        runnable = [r for r in results if r.status == Status.RUNS]
        not_runnable = [r for r in results if r.status != Status.RUNS]

        if runnable:
            print("\n✓ RUNS on these GPUs:")
            for r in sorted(runnable, key=lambda x: x.gpu_vram_gb):
                free_pct = r.vram_free_percent
                print(f"  {r.gpu_name:<25} ({r.gpu_vram_gb:3} GB) - "
                      f"{free_pct:4.1f}% free")

        if not_runnable:
            print("\n✗ DOESN'T RUN - needs more VRAM:")
            sorted_by_need = sorted(
                not_runnable,
                key=lambda x: -(x.required_vram_gb - x.gpu_vram_gb),
            )[:5]
            for r in sorted_by_need:
                print(f"  {r.gpu_name:<25} ({r.gpu_vram_gb:3} GB) - "
                      f"needs {r.required_vram_gb:.1f} GB")

            # Suggest layer offload option
            print(f"\n{Colors.info('💡 Tip: Use --optimize-config to see layer offload options')}")
            print(f"{Colors.dim('   Some layers can run on GPU while others use system RAM.')}")

        print("\n" + "=" * 70)
        sys.exit(0)

    # Original flow: show all combinations
    # Fluxo original: mostrar todas as combinações

    # Quantization info
    # Info sobre quantização
    bytes_per_param = quantization.bytes_per_param
    kv_mult = quantization.kv_cache_multiplier
    if args.quantization != "fp16":
        print(
            f"\nℹ️  Using {args.quantization.upper()}: / Usando {args.quantization.upper()}: "
            f"{bytes_per_param} bytes/param, "
            f"KV cache ×{kv_mult}"
        )

    # Select GPUs
    # Selecionar GPUs
    if args.gpu_type == "consumer":
        gpus = get_consumer_gpus()
        gpu_type_label = "Consumer"
    elif args.gpu_type == "datacenter":
        gpus = get_datacenter_gpus()
        gpu_type_label = "Datacenter"
    else:
        gpus = get_all_gpus()
        gpu_type_label = "All / Todas"

    # Select models
    # Selecionar modelos
    models = get_all_models()

    # Calculate
    # Calcular
    calculator = VRAMCalculator(quantization=quantization, calculation_mode=calculation_mode)
    results = []
    for model in models:
        for gpu in gpus:
            result = calculator.evaluate_pair(model, gpu, args.context)
            results.append(result)

    # Header
    print("\n" + "=" * 70)
    print("LLM LOCAL INFERENCE VIABILITY CALCULATOR")
    print("CALCULADORA DE VIABILIDADE DE INFERÊNCIA LOCAL DE LLMs")
    print("=" * 70)
    print("\nConfiguration / Configuração:")
    print("  • Batch size: 1 (inference)")
    print(f"  • Context: / Contexto: {args.context:,} tokens")
    print(f"  • Quantization: / Quantização: {Colors.bold(args.quantization.upper())}")
    print(f"  • Mode: / Modo: {args.mode}")
    print(f"  • GPUs: {gpu_type_label} ({len(gpus)} models / modelos)")
    print(f"  • LLM Models: / Modelos LLM: {len(models)} sizes / tamanhos")

    # Main table
    # Tabela principal
    print_table(
        results,
        group_by_gpu=args.group_gpu,
        show_only_runs=args.only_runs,
    )

    # Summaries
    # Resumos
    if args.summary in ("model", "both"):
        print_summary_by_model(results)
    if args.summary in ("gpu", "both"):
        print_summary_by_gpu(results)

    # Exports
    # Exportações
    if args.export_csv:
        export_csv(results, args.export_csv)
    if args.export_json:
        export_json(results, args.export_json, args.context, quantization)

    # Warning for 24GB GPUs running near limit
    # Aviso para GPUs de 24GB rodando no limite
    tight_24gb = [
        r for r in results
        if r.status == Status.RUNS and r.gpu_vram_gb == 24 and 22 <= r.required_vram_gb <= 24
    ]
    if tight_24gb:
        print("\n" + "⚠️ " * 12)
        print(f"\n{Colors.BOLD}NOTICE: 24GB GPUs running at the limit:{Colors.RESET}")
        print(f"  {Colors.warning('• Any batching (batch_size > 1) may cause OOM')}")
        print(f"  {Colors.warning('• LoRA adapters add ~0.5-2GB per adapter')}")
        print(f"  {Colors.warning('• Speculative decoding adds ~30-50% memory')}")
        print(f"  {Colors.warning('• Tool calling / function calling adds overhead')}")
        print(f"\n{Colors.DIM}  Assumptions for calculations above:{Colors.RESET}")
        print("    Memory model: PyTorch allocator (HF Transformers, vLLM)")
        print("    KV cache: FP16 (quantized KV is experimental/exclusive)")
        print("    batch_size = 1 (no batching)")
        print("    No LoRA adapters active")
        print("    No speculative decoding")
        print("    No tool calling overhead")
        print(f"    {Colors.DIM}Note: TensorRT-LLM, llama.cpp, EXL2 may have different behavior{Colors.RESET}")
        print()

    print("\n" + "=" * 70)
    print()
if __name__ == "__main__":
    main()