Código-fonte para calculator

"""
VRAM usage calculator for LLM inference.

Implements the logic for estimating the memory required to run
language models on specific GPUs.

Calculadora de uso de VRAM para inferência de LLMs.
Implementa a lógica de estimativa de memória necessária para rodar
modelos de linguagem em GPUs específicas.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Dict

from models import LLMModel, get_all_models
from gpus import GPU, get_all_gpus
from formats import ModelFormat, FORMAT_OVERHEAD



[documentos]
class Quantization(Enum):
    """Supported precision/quantization types for inference.

    Tipos de precisão/quantização suportados para inferência.
    """

    FP32 = "fp32"      # 4 bytes per parameter (float32) / 4 bytes por parâmetro
    FP16 = "fp16"      # 2 bytes per parameter (float16 / half precision)
    INT8 = "int8"      # 1 byte per parameter (8-bit quantization)
    INT4 = "int4"      # 0.5 byte per parameter (4-bit quantization, packed)

    @property
    def bytes_per_param(self) -> float:
        """Returns bytes per parameter for this precision.

        Retorna bytes por parâmetro para esta precisão.
        """
        return BYTES_PER_PARAM[self]

    @property
    def kv_cache_multiplier(self) -> float:
        """KV cache multiplier based on precision.

        Multiplicador para KV cache baseado na precisão.

        Note: KV cache usually remains in FP16 even with quantized weights,
        but some frameworks support quantized KV cache with INT8/INT4.

        Nota: KV cache geralmente permanece em FP16 mesmo com pesos quantizados,
        mas com INT8/INT4 alguns frameworks suportam KV cache quantizado.
        """
        return KV_CACHE_MULTIPLIER.get(self, 1.0)




[documentos]
class Status(Enum):
    """Inference feasibility status.

    Status de viabilidade da inferência.
    """
    RUNS = "RUNS"      # RUNS / RODA
    NOT_RUNS = "DOESN'T RUN"  # DOESN'T RUN / NÃO RODA




[documentos]
class CalculationMode(Enum):
    """VRAM calculation mode.

    Modo de cálculo de VRAM.
    """
    THEORETICAL = "theoretical"   # Ideal minimum, batch=1, no padding/alignment
    CONSERVATIVE = "conservative"   # Current default, some overhead buffer
    PRODUCTION = "production"     # Real-world serving (batch>1, buffers, fragmentation)




[documentos]
@dataclass
class InferenceResult:
    """Feasibility analysis result for a model × GPU pair.

    Resultado de análise de viabilidade para um par modelo × GPU.

    Attributes:
        model_name: Name of the LLM model
        model_params_billion: Model size in billions of parameters
        gpu_name: Name of the GPU
        gpu_vram_gb: GPU VRAM capacity in GB
        required_vram_gb: VRAM required for inference
        status: Feasibility status (RUNS or DOESN'T RUN)
        vram_free_percent: Percentage of VRAM remaining
        quantization: Quantization type used
        warning: Optional warning message
    """

    model_name: str
    model_params_billion: int
    gpu_name: str
    gpu_vram_gb: int
    required_vram_gb: float
    status: Status
    vram_free_percent: float
    quantization: Quantization
    warning: str | None = None


[documentos]
    def to_dict(self) -> dict:
        """Converts to serializable dictionary.

        Converte para dicionário serializável.
        """
        return {
            "model": self.model_name,
            "model_size": f"{self.model_params_billion}B",
            "gpu": self.gpu_name,
            "gpu_vram_gb": self.gpu_vram_gb,
            "required_vram_gb": round(self.required_vram_gb, 2),
            "status": self.status.value,
            "vram_free_percent": round(self.vram_free_percent, 1),
            "quantization": self.quantization.value,
            "warning": self.warning,
        }





[documentos]
@dataclass
class CalculationBreakdown:
    """Detailed VRAM calculation breakdown.

    Breakdown detalhado do cálculo de VRAM.

    Attributes:
        params_memory_gb: Memory for model parameters in GB
        overhead_gb: Memory overhead in GB
        model_with_overhead_gb: Parameters + overhead in GB
        kv_cache_gb: KV cache memory in GB
        total_vram_gb: Total VRAM required in GB
    """

    params_memory_gb: float
    overhead_gb: float
    model_with_overhead_gb: float
    kv_cache_gb: float
    total_vram_gb: float


[documentos]
    def to_dict(self) -> dict:
        """Converts to serializable dictionary.

        Converte para dicionário serializável.
        """
        return {
            "params_memory_gb": round(self.params_memory_gb, 2),
            "overhead_gb": round(self.overhead_gb, 2),
            "model_with_overhead_gb": round(self.model_with_overhead_gb, 2),
            "kv_cache_gb": round(self.kv_cache_gb, 2),
            "total_vram_gb": round(self.total_vram_gb, 2),
        }




# ============================================================================
# CALCULATION CONSTANTS (Conservative values)
# CONSTANTES DE CÁLCULO (Valores conservadores)
# ============================================================================

# Bytes per parameter for each precision
# Bytes por parâmetro para cada precisão
BYTES_PER_PARAM = {
    Quantization.FP32: 4.0,  # 32 bits = 4 bytes
    Quantization.FP16: 2.0,  # 16 bits = 2 bytes
    Quantization.INT8: 1.0,  # 8 bits = 1 byte
    Quantization.INT4: 0.5,  # 4 bits = 0.5 byte (packed)
}

# KV cache multiplier per precision
# Multiplicador do KV cache por precisão
# IMPORTANT: In most production stacks today, KV cache stays in FP16/BF16
# even with quantized weights (INT4/INT8). KV cache quantization is experimental.
# IMPORTANTE: Na maioria dos stacks de produção hoje, KV cache permanece em FP16/BF16
# mesmo com pesos quantizados (INT4/INT8). Quantização de KV cache é experimental.
#
# Conservative values: assume FP16 for KV cache unless explicitly optimized
# Valores conservadores: assumimos FP16 para KV cache exceto quando otimizado explicitamente
KV_CACHE_MULTIPLIER = {
    Quantization.FP32: 2.0,  # FP32 uses 2x the space of FP16
    Quantization.FP16: 1.0,  # Baseline - standard for most frameworks
    Quantization.INT8: 0.85,  # INT8 weights, but KV cache often FP16 (conservative)
    Quantization.INT4: 0.85,  # INT4 weights, but KV cache usually FP16 (realistic)
}
# Note: 0.85 assumes some KV cache optimization (paged KV, compression).
# For strict real-world accuracy with INT4 weights, use 1.0 (FP16 KV cache).
# Only vLLM paged KV, custom kernels, or EXL2-like backends support quantized KV cache.

# Overhead factor (runtime, activations, etc.)
# Fator de overhead (runtime, activations, etc.)
# Conservative: framework + memory overhead during inference
# Conservador: framework + overhead de memória durante inferência
OVERHEAD_FACTOR = 0.30  # 30% overhead

# Minimum safety margin to consider viable
# Margem de segurança mínima para considerar viável
SAFETY_MARGIN_THRESHOLD = 0.10  # 10%



[documentos]
class VRAMCalculator:
    """VRAM calculator for LLM inference.

    Calculadora de VRAM para inferência de LLMs.
    """


[documentos]
    def __init__(
        self,
        quantization: Quantization = Quantization.FP16,
        overhead_factor: float = OVERHEAD_FACTOR,
        calculation_mode: CalculationMode = CalculationMode.CONSERVATIVE,
    ):
        """Initialize the calculator.

        Inicializa a calculadora.

        Args:
            quantization: Quantization type (default: FP16)
            overhead_factor: Overhead factor (0.30 = 30%)
            calculation_mode: Calculation mode for VRAM estimation
        """
        self.quantization = quantization
        self.overhead_factor = overhead_factor
        self.calculation_mode = calculation_mode



[documentos]
    def calculate_params_memory(self, params_billion: int) -> float:
        """Calculate base memory for model parameters.

        Calcula memória base dos parâmetros do modelo.

        Formula: params_memory_gb = params_billion * bytes_per_param
        Fórmula: params_memory_gb = params_billion × BYTES_PER_PARAM

        Note: params_billion is in billions, and 1 billion bytes = 1 GB.
        So for FP16 (2 bytes/param): 70B model = 70 × 2 = 140 GB

        Args:
            params_billion: Model size in billions of parameters

        Returns:
            Memory in GB
        """
        bytes_per_param = BYTES_PER_PARAM[self.quantization]
        # params_billion is in billions, 1 billion bytes = 1 GB
        # For FP16: 7B × 2 bytes = 14 GB, 70B × 2 bytes = 140 GB
        params_memory_gb = params_billion * bytes_per_param
        return params_memory_gb



[documentos]
    def calculate_overhead(self, params_memory_gb: float) -> float:
        """Calculate memory overhead (runtime, activations, etc.).

        Calcula overhead de memória (runtime, activations, etc.).

        Formula: overhead = params_memory * overhead_factor
        Fórmula: overhead = params_memory × overhead_factor

        Args:
            params_memory_gb: Base parameter memory in GB

        Returns:
            Overhead in GB
        """
        return params_memory_gb * self.overhead_factor



[documentos]
    def calculate_kv_cache(
        self,
        kv_cache_mb_per_token: float,
        context_tokens: int,
    ) -> float:
        """Calculate memory required for KV cache.

        Calcula memória necessária para KV cache.

        Formula: kv_cache_gb = (kv_cache_mb_per_token * context_tokens * multiplier * mode_buffer) / 1024
        Fórmula: kv_cache_gb = (kv_cache_mb_per_token × context_tokens × multiplier × mode_buffer) / 1024

        The base KV cache is defined for FP16. For other precisions, we apply
        a multiplier: FP32 uses 2x, INT8/INT4 may use less depending on the framework.

        O KV cache base é definido para FP16. Para outras precisões, aplicamos
        um multiplicador: FP32 usa 2x, INT8/INT4 podem usar menos dependendo do framework.

        The calculation mode adds a buffer for production scenarios:
        - THEORETICAL: No buffer (ideal minimum, batch=1, no padding)
        - CONSERVATIVE: 10% buffer (minimal overhead)
        - PRODUCTION: 25% buffer (batch>1, fragmentation, real-world serving)

        Args:
            kv_cache_mb_per_token: MB per token for the model (FP16 baseline)
            context_tokens: Context size in tokens

        Returns:
            KV cache in GB
        """
        multiplier = self.quantization.kv_cache_multiplier

        # Production buffer based on calculation mode
        # Buffer de produção baseado no modo de cálculo
        mode_buffer = {
            CalculationMode.THEORETICAL: 1.0,   # No extra buffer / Sem buffer extra
            CalculationMode.CONSERVATIVE: 1.1,  # 10% buffer for overhead
            CalculationMode.PRODUCTION: 1.25,   # 25% buffer for real-world serving
        }.get(self.calculation_mode, 1.0)

        kv_cache_mb = kv_cache_mb_per_token * context_tokens * multiplier * mode_buffer
        kv_cache_gb = kv_cache_mb / 1024
        return kv_cache_gb



[documentos]
    def calculate_total_vram(
        self,
        model: LLMModel,
        context_tokens: int,
    ) -> CalculationBreakdown:
        """Calculate total VRAM required for a model with given context.

        Calcula VRAM total necessária para um modelo com contexto dado.

        Args:
            model: LLM model to evaluate
            context_tokens: Context size in tokens

        Returns:
            CalculationBreakdown with calculation details
        """
        # 5.1 Model parameters
        # Parâmetros do modelo
        params_memory_gb = self.calculate_params_memory(model.params_billion)

        # 5.2 Overhead
        overhead_gb = self.calculate_overhead(params_memory_gb)
        model_with_overhead_gb = params_memory_gb + overhead_gb

        # 5.3 KV Cache
        kv_cache_gb = self.calculate_kv_cache(
            model.kv_cache_mb_per_token,
            context_tokens,
        )

        # 5.4 Total VRAM
        # VRAM total
        total_vram_gb = model_with_overhead_gb + kv_cache_gb

        return CalculationBreakdown(
            params_memory_gb=params_memory_gb,
            overhead_gb=overhead_gb,
            model_with_overhead_gb=model_with_overhead_gb,
            kv_cache_gb=kv_cache_gb,
            total_vram_gb=total_vram_gb,
        )



[documentos]
    def evaluate_pair(
        self,
        model: LLMModel,
        gpu: GPU,
        context_tokens: int,
        quantization: Quantization | None = None,
    ) -> InferenceResult:
        """Evaluate if a model × GPU pair is viable for the given context.

        Avalia se um par modelo × GPU é viável para o contexto dado.

        Args:
            model: LLM model to evaluate
            gpu: GPU to evaluate
            context_tokens: Context size in tokens
            quantization: Override quantization (uses instance default if None)

        Returns:
            InferenceResult with status and details
        """
        breakdown = self.calculate_total_vram(model, context_tokens)
        required_vram = breakdown.total_vram_gb

        # 6. Decision logic
        # Lógica de decisão
        if required_vram <= gpu.vram_gb:
            status = Status.RUNS
            vram_free_percent = ((gpu.vram_gb - required_vram) / gpu.vram_gb) * 100

            # Warning if margin < 10%
            # Aviso se margem < 10%
            warning = None
            if vram_free_percent < (SAFETY_MARGIN_THRESHOLD * 100):
                warning = f"Low safety margin ({vram_free_percent:.1f}% free) / Margem de segurança baixa"

        else:
            status = Status.NOT_RUNS
            vram_free_percent = 0.0
            warning = None

        return InferenceResult(
            model_name=model.name,
            model_params_billion=model.params_billion,
            gpu_name=gpu.name,
            gpu_vram_gb=gpu.vram_gb,
            required_vram_gb=required_vram,
            status=status,
            vram_free_percent=vram_free_percent,
            quantization=quantization or self.quantization,
            warning=warning,
        )



[documentos]
    def calculate_all_combinations(
        self,
        context_tokens: int,
        models: List[LLMModel] | None = None,
        gpus: List[GPU] | None = None,
    ) -> List[InferenceResult]:
        """Calculate feasibility for all model × GPU combinations.

        Calcula viabilidade para todas as combinações modelo × GPU.

        Args:
            context_tokens: Context size in tokens
            models: List of models (uses all if None)
            gpus: List of GPUs (uses all if None)

        Returns:
            List of InferenceResult for all combinations
        """
        if models is None:
            models = get_all_models()
        if gpus is None:
            gpus = get_all_gpus()

        results = []
        for model in models:
            for gpu in gpus:
                result = self.evaluate_pair(model, gpu, context_tokens)
                results.append(result)

        return results





[documentos]
def calculate_inference(context_tokens: int) -> dict:
    """Main calculation function (simplified interface).

    Função principal de cálculo (interface simplificada).

    Args:
        context_tokens: Context size in tokens

    Returns:
        Dictionary with structured results
    """
    calculator = VRAMCalculator()
    results = calculator.calculate_all_combinations(context_tokens)

    return {
        "context_tokens": context_tokens,
        "quantization": calculator.quantization.value,
        "results": [r.to_dict() for r in results],
    }



# ============================================================================
# LAYER OFFLOAD CALCULATOR
# CALCULADORA DE OFFLOAD DE CAMADAS
# ============================================================================


[documentos]
@dataclass
class LayerOffloadResult:
    """Result for optimal layer offload calculation.

    Resultado para cálculo de offload ótimo de camadas.

    Attributes:
        total_layers: Total number of layers in the model
        layers_on_gpu: Number of layers that fit on GPU
        layers_on_cpu: Number of layers that must remain on CPU
        gpu_vram_used: VRAM used for GPU layers
        cpu_ram_used: System RAM used for CPU layers
        offload_ratio: Ratio of layers on GPU (0.0 to 1.0)
        performance_impact: Estimated performance impact (0-100% slower)
        recommended_gpu_split: Recommended --gpu-layers parameter for llama.cpp
    """
    total_layers: int
    layers_on_gpu: int
    layers_on_cpu: int
    gpu_vram_used: float  # GB
    cpu_ram_used: float   # GB
    offload_ratio: float
    performance_impact: float  # Percentage slower
    recommended_gpu_split: str
    status: str  # "full_gpu", "partial_offload", "cpu_only"




[documentos]
class LayerOffloadCalculator:
    """Calculates optimal layer offload configuration for hybrid GPU+CPU inference.

    Calcula configuração ótima de offload de camadas para inferência híbrida GPU+CPU.

    This calculator helps determine how many transformer layers can fit in GPU VRAM
    for scenarios where the full model doesn't fit, enabling partial offloading
    strategies used by llama.cpp, AutoGPTQ, and other frameworks.

    Esta calculadora ajuda a determinar quantas camadas transformer cabem na VRAM
    para cenários onde o modelo completo não cabe, permitindo estratégias de offload
    parcial usadas por llama.cpp, AutoGPTQ e outros frameworks.
    """


[documentos]
    def __init__(
        self,
        quantization: Quantization = Quantization.FP16,
        safety_margin_gb: float = 1.0,
        model_format: ModelFormat = ModelFormat.FP16,
    ):
        """Initialize the layer offload calculator.

        Inicializa a calculadora de offload de camadas.

        Args:
            quantization: Quantization type for layer size calculation
            safety_margin_gb: Safety margin in GB to reserve
            model_format: Model format for overhead calculation
        """
        self.quantization = quantization
        self.safety_margin_gb = safety_margin_gb
        self.model_format = model_format



[documentos]
    def estimate_layer_size_gb(
        self,
        model: LLMModel,
    ) -> float:
        """Estimate the VRAM size of a single transformer layer.

        Estima o tamanho VRAM de uma única camada transformer.

        Args:
            model: LLM model to analyze

        Returns:
            Estimated size of one layer in GB

        Formula:
            layer_size = (model_params / num_layers) * bytes_per_param * format_overhead
        """
        num_layers = model.estimated_layers
        params_per_layer = model.params_billion / num_layers

        bytes_per_param = BYTES_PER_PARAM[self.quantization]
        format_multiplier = FORMAT_OVERHEAD.get(self.model_format, 1.0)

        # Layer size in GB (params_billion is already in "GB worth of bytes at 1 byte/param")
        layer_size_gb = params_per_layer * bytes_per_param * format_multiplier

        return layer_size_gb



[documentos]
    def calculate_kv_cache_memory(
        self,
        model: LLMModel,
        context_tokens: int,
    ) -> float:
        """Calculate KV cache memory requirement.

        Calcula requisito de memória de KV cache.

        Args:
            model: LLM model
            context_tokens: Context size in tokens

        Returns:
            KV cache memory in GB
        """
        kv_cache_mb = model.kv_cache_mb_per_token * context_tokens
        kv_cache_gb = kv_cache_mb / 1024
        return kv_cache_gb



[documentos]
    def calculate_optimal_offload(
        self,
        model: LLMModel,
        gpu: GPU,
        context_tokens: int,
    ) -> LayerOffloadResult:
        """Calculate optimal layer offload configuration.

        Calcula configuração ótima de offload de camadas.

        Args:
            model: LLM model to analyze
            gpu: GPU to use for offload
            context_tokens: Context size in tokens

        Returns:
            LayerOffloadResult with optimal configuration
        """
        total_layers = model.estimated_layers
        layer_size_gb = self.estimate_layer_size_gb(model)

        # Calculate available VRAM for layers
        # Available = Total VRAM - KV cache - overhead - safety margin
        kv_cache_gb = self.calculate_kv_cache_memory(model, context_tokens)

        # Runtime overhead (activations, buffers)
        params_memory_gb = model.params_billion * BYTES_PER_PARAM[self.quantization]
        overhead_gb = params_memory_gb * OVERHEAD_FACTOR

        # VRAM available for layers
        available_for_layers = gpu.vram_gb - kv_cache_gb - overhead_gb - self.safety_margin_gb

        # Calculate how many layers fit on GPU
        if available_for_layers <= 0:
            # Not enough VRAM even for KV cache
            layers_on_gpu = 0
        else:
            layers_on_gpu = min(total_layers, int(available_for_layers / layer_size_gb))

        layers_on_cpu = total_layers - layers_on_gpu
        offload_ratio = layers_on_gpu / total_layers if total_layers > 0 else 0

        # Calculate memory usage
        gpu_vram_used = layers_on_gpu * layer_size_gb + kv_cache_gb + overhead_gb
        cpu_ram_used = layers_on_cpu * layer_size_gb

        # Estimate performance impact
        # Full GPU: 0% impact
        # Partial offload: 2-5% slower per 10% of layers on CPU (PCIe bottleneck)
        # CPU only: ~10-20x slower (very rough estimate)
        if layers_on_gpu == total_layers:
            performance_impact = 0.0
            status = "full_gpu"
        elif layers_on_gpu == 0:
            performance_impact = 1000.0  # ~10x slower = 1000% slower
            status = "cpu_only"
        else:
            # Estimate: 3% slower per 10% of layers on CPU
            cpu_ratio = layers_on_cpu / total_layers
            performance_impact = cpu_ratio * 30.0  # 30% per full offload equivalent
            status = "partial_offload"

        # Recommended llama.cpp --gpu-layers parameter
        if layers_on_gpu == 0:
            recommended = "0 (CPU only)"
        elif layers_on_gpu == total_layers:
            recommended = f"{layers_on_gpu} (all layers, full GPU)"
        else:
            recommended = f"{layers_on_gpu} (of {total_layers} total)"

        return LayerOffloadResult(
            total_layers=total_layers,
            layers_on_gpu=layers_on_gpu,
            layers_on_cpu=layers_on_cpu,
            gpu_vram_used=gpu_vram_used,
            cpu_ram_used=cpu_ram_used,
            offload_ratio=offload_ratio,
            performance_impact=performance_impact,
            recommended_gpu_split=recommended,
            status=status,
        )




# ============================================================================
# CPU OFFLOAD CALCULATOR
# CALCULADORA DE OFFLOAD DE CPU
# ============================================================================


[documentos]
@dataclass
class PCIeConfig:
    """PCIe bandwidth configuration.

    Configuração de largura de banda PCIe.

    Attributes:
        generation: PCIe generation (3.0, 4.0, 5.0)
        bandwidth_gb_s: Theoretical bandwidth in GB/s (per lane x16)
        lanes: Number of lanes (typically x16 for GPUs)
    """
    generation: str
    bandwidth_gb_s: float
    lanes: int = 16

    @property
    def effective_bandwidth_gb_s(self) -> float:
        """Effective bandwidth accounting for protocol overhead.

        Largura de banda efetiva considerando overhead de protocolo.
        """
        # Real-world effective bandwidth is ~70-80% of theoretical
        return self.bandwidth_gb_s * 0.75



# PCIe bandwidth specifications (x16)
# Especificações de largura de banda PCIe (x16)
PCIE_CONFIGS: Dict[str, PCIeConfig] = {
    "3.0": PCIeConfig(generation="3.0", bandwidth_gb_s=16.0, lanes=16),   # ~16 GB/s theoretical
    "4.0": PCIeConfig(generation="4.0", bandwidth_gb_s=32.0, lanes=16),   # ~32 GB/s theoretical
    "5.0": PCIeConfig(generation="5.0", bandwidth_gb_s=64.0, lanes=16),   # ~64 GB/s theoretical
}



[documentos]
@dataclass
class CPUOffloadResult:
    """Result for CPU offload calculation.

    Resultado para cálculo de offload de CPU.

    Attributes:
        system_ram_required: Total system RAM required in GB
        system_ram_available: System RAM available in GB
        fits_in_ram: Whether the model fits in system RAM
        offload_config: Layer offload configuration
        pcie_generation: PCIe generation used
        estimated_token_speed: Estimated tokens/second with offload
        speed_vs_full_gpu: Speed ratio vs full GPU (0.0 to 1.0)
    """
    system_ram_required: float  # GB
    system_ram_available: float  # GB
    fits_in_ram: bool
    offload_config: LayerOffloadResult
    pcie_generation: str
    estimated_token_speed: float  # tokens/second
    speed_vs_full_gpu: float  # Ratio (0.0 to 1.0)




[documentos]
class CPUOffloadCalculator:
    """Calculates hybrid GPU+CPU inference configurations.

    Calcula configurações de inferência híbrida GPU+CPU.

    This calculator helps determine:
    - How much system RAM is needed for CPU offload
    - Performance impact based on PCIe bandwidth
    - Optimal layer distribution between GPU and CPU

    Esta calculadora ajuda a determinar:
    - Quanta RAM do sistema é necessária para offload de CPU
    - Impacto de performance baseado em largura de banda PCIe
    - Distribuição ótima de camadas entre GPU e CPU
    """


[documentos]
    def __init__(
        self,
        quantization: Quantization = Quantization.FP16,
        system_ram_gb: float = 32.0,
        pcie_generation: str = "4.0",
        model_format: ModelFormat = ModelFormat.FP16,
    ):
        """Initialize the CPU offload calculator.

        Inicializa a calculadora de offload de CPU.

        Args:
            quantization: Quantization type
            system_ram_gb: Available system RAM in GB
            pcie_generation: PCIe generation (3.0, 4.0, 5.0)
            model_format: Model format
        """
        self.quantization = quantization
        self.system_ram_gb = system_ram_gb
        self.pcie_config = PCIE_CONFIGS.get(pcie_generation, PCIE_CONFIGS["4.0"])
        self.model_format = model_format
        self.layer_calculator = LayerOffloadCalculator(
            quantization=quantization,
            model_format=model_format,
        )



[documentos]
    def calculate_total_model_memory(self, model: LLMModel) -> float:
        """Calculate total model memory requirement.

        Calcula requisito total de memória do modelo.

        Args:
            model: LLM model

        Returns:
            Total memory in GB
        """
        bytes_per_param = BYTES_PER_PARAM[self.quantization]
        format_multiplier = FORMAT_OVERHEAD.get(self.model_format, 1.0)

        # Base parameter memory
        params_memory = model.params_billion * bytes_per_param * format_multiplier

        # Add overhead for runtime
        overhead = params_memory * OVERHEAD_FACTOR

        return params_memory + overhead



[documentos]
    def estimate_offload_performance(
        self,
        offload_result: LayerOffloadResult,
        model: LLMModel,
    ) -> tuple[float, float]:
        """Estimate performance with offload configuration.

        Estima performance com configuração de offload.

        Args:
            offload_result: Layer offload configuration result
            model: LLM model

        Returns:
            Tuple of (estimated_tokens_per_second, speed_ratio_vs_full_gpu)
        """
        # Baseline: full GPU inference on RTX 3090-class hardware
        # These are rough estimates for a 7B model at FP16
        # Adjust by model size
        baseline_speed = 50.0 / (model.params_billion / 7.0)  # tokens/sec

        if offload_result.status == "full_gpu":
            return baseline_speed, 1.0
        elif offload_result.status == "cpu_only":
            # CPU inference: ~10-20x slower
            cpu_speed = baseline_speed / 15.0
            return cpu_speed, 1.0 / 15.0
        else:
            # Partial offload: speed depends on GPU layer ratio
            # and PCIe bandwidth for data transfer
            gpu_ratio = offload_result.offload_ratio

            # Base speed from GPU layers
            gpu_speed = baseline_speed * gpu_ratio

            # CPU contribution (slower)
            cpu_speed = (baseline_speed / 15.0) * (1 - gpu_ratio)

            # Combine (simplified model)
            estimated_speed = gpu_speed + cpu_speed

            # Adjust for PCIe bandwidth bottleneck
            # Faster PCIe = better performance with offload
            pcie_multiplier = self.pcie_config.effective_bandwidth_gb_s / 24.0  # Normalized to PCIe 4.0
            estimated_speed *= (0.7 + 0.3 * pcie_multiplier)

            speed_ratio = estimated_speed / baseline_speed

            return estimated_speed, speed_ratio



[documentos]
    def calculate_offload(
        self,
        model: LLMModel,
        gpu: GPU,
        context_tokens: int,
    ) -> CPUOffloadResult:
        """Calculate CPU offload configuration.

        Calcula configuração de offload de CPU.

        Args:
            model: LLM model
            gpu: GPU to use
            context_tokens: Context size in tokens

        Returns:
            CPUOffloadResult with configuration details
        """
        # Get layer offload configuration
        offload_config = self.layer_calculator.calculate_optimal_offload(
            model, gpu, context_tokens
        )

        # Calculate system RAM requirement
        # CPU layers + any additional runtime overhead on CPU
        total_memory = self.calculate_total_model_memory(model)
        system_ram_required = offload_config.cpu_ram_used + 2.0  # +2GB for CPU runtime overhead

        fits_in_ram = system_ram_required <= self.system_ram_gb

        # Estimate performance
        estimated_speed, speed_ratio = self.estimate_offload_performance(
            offload_config, model
        )

        return CPUOffloadResult(
            system_ram_required=system_ram_required,
            system_ram_available=self.system_ram_gb,
            fits_in_ram=fits_in_ram,
            offload_config=offload_config,
            pcie_generation=self.pcie_config.generation,
            estimated_token_speed=estimated_speed,
            speed_vs_full_gpu=speed_ratio,
        )