"""
VRAM usage calculator for LLM inference.
Implements the logic for estimating the memory required to run
language models on specific GPUs.
Calculadora de uso de VRAM para inferência de LLMs.
Implementa a lógica de estimativa de memória necessária para rodar
modelos de linguagem em GPUs específicas.
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Dict
from models import LLMModel, get_all_models
from gpus import GPU, get_all_gpus
from formats import ModelFormat, FORMAT_OVERHEAD
class Quantization(Enum):
    """Precision / quantization formats supported for inference."""

    FP32 = "fp32"  # float32: 4 bytes per parameter
    FP16 = "fp16"  # float16 / half precision: 2 bytes per parameter
    INT8 = "int8"  # 8-bit quantization: 1 byte per parameter
    INT4 = "int4"  # 4-bit quantization (packed): 0.5 byte per parameter

    @property
    def bytes_per_param(self) -> float:
        """Bytes of storage used by one parameter at this precision."""
        return BYTES_PER_PARAM[self]

    @property
    def kv_cache_multiplier(self) -> float:
        """Scaling factor applied to the FP16-baseline KV-cache size.

        The KV cache usually stays in FP16 even when the weights are
        quantized, although some frameworks support INT8/INT4 KV caches.
        Precisions missing from the table fall back to 1.0 (FP16 baseline).
        """
        return KV_CACHE_MULTIPLIER.get(self, 1.0)
class Status(Enum):
    """Verdict on whether inference is feasible on a given GPU."""

    RUNS = "RUNS"
    NOT_RUNS = "DOESN'T RUN"
class CalculationMode(Enum):
    """How aggressively the VRAM estimate should be buffered."""

    THEORETICAL = "theoretical"    # Ideal minimum: batch=1, no padding/alignment
    CONSERVATIVE = "conservative"  # Default: small overhead buffer
    PRODUCTION = "production"      # Real-world serving: batch>1, fragmentation
@dataclass
class InferenceResult:
    """Feasibility verdict for one model × GPU pairing.

    Attributes:
        model_name: Name of the LLM model.
        model_params_billion: Model size in billions of parameters.
        gpu_name: Name of the GPU.
        gpu_vram_gb: GPU VRAM capacity in GB.
        required_vram_gb: VRAM required for inference, in GB.
        status: Feasibility status (RUNS or DOESN'T RUN).
        vram_free_percent: Percentage of VRAM left over when it runs.
        quantization: Quantization used for the estimate.
        warning: Optional warning message (e.g. low safety margin).
    """

    model_name: str
    model_params_billion: int
    gpu_name: str
    gpu_vram_gb: int
    required_vram_gb: float
    status: Status
    vram_free_percent: float
    quantization: Quantization
    warning: str | None = None

    def to_dict(self) -> dict:
        """Serialize this result as a JSON-friendly dictionary."""
        payload = {
            "model": self.model_name,
            "model_size": f"{self.model_params_billion}B",
            "gpu": self.gpu_name,
            "gpu_vram_gb": self.gpu_vram_gb,
        }
        payload["required_vram_gb"] = round(self.required_vram_gb, 2)
        payload["status"] = self.status.value
        payload["vram_free_percent"] = round(self.vram_free_percent, 1)
        payload["quantization"] = self.quantization.value
        payload["warning"] = self.warning
        return payload
@dataclass
class CalculationBreakdown:
    """Itemized VRAM estimate for one model/context combination.

    Attributes:
        params_memory_gb: Memory for the model weights, in GB.
        overhead_gb: Runtime memory overhead, in GB.
        model_with_overhead_gb: Weights plus overhead, in GB.
        kv_cache_gb: KV-cache memory, in GB.
        total_vram_gb: Total VRAM required, in GB.
    """

    params_memory_gb: float
    overhead_gb: float
    model_with_overhead_gb: float
    kv_cache_gb: float
    total_vram_gb: float

    def to_dict(self) -> dict:
        """Serialize the breakdown, rounding every figure to 2 decimals."""
        return {
            name: round(getattr(self, name), 2)
            for name in (
                "params_memory_gb",
                "overhead_gb",
                "model_with_overhead_gb",
                "kv_cache_gb",
                "total_vram_gb",
            )
        }
# ============================================================================
# CALCULATION CONSTANTS (conservative values)
# ============================================================================

# Storage bytes per model parameter at each precision.
BYTES_PER_PARAM = {
    Quantization.FP32: 4.0,  # 32 bits = 4 bytes
    Quantization.FP16: 2.0,  # 16 bits = 2 bytes
    Quantization.INT8: 1.0,  # 8 bits = 1 byte
    Quantization.INT4: 0.5,  # 4 bits = 0.5 byte (packed)
}

# KV-cache size multiplier per precision, relative to the FP16 baseline.
# IMPORTANT: in most production stacks today the KV cache stays in FP16/BF16
# even with quantized weights (INT4/INT8); KV-cache quantization is experimental.
#
# Conservative values: assume an FP16 KV cache unless explicitly optimized.
KV_CACHE_MULTIPLIER = {
    Quantization.FP32: 2.0,   # FP32 uses 2x the space of FP16
    Quantization.FP16: 1.0,   # Baseline - standard for most frameworks
    Quantization.INT8: 0.85,  # INT8 weights; KV cache often still FP16 (conservative)
    Quantization.INT4: 0.85,  # INT4 weights; KV cache usually FP16 (realistic)
}
# NOTE: 0.85 assumes some KV-cache optimization (paged KV, compression).
# For strict real-world accuracy with INT4 weights, use 1.0 (FP16 KV cache).
# Only vLLM paged KV, custom kernels, or EXL2-like backends support quantized KV cache.

# Overhead factor for runtime memory (framework, activations, buffers),
# applied on top of the raw parameter memory.
OVERHEAD_FACTOR = 0.30  # 30% overhead

# Minimum free-VRAM margin below which a "RUNS" verdict carries a warning.
SAFETY_MARGIN_THRESHOLD = 0.10  # 10%
class VRAMCalculator:
    """VRAM calculator for LLM inference.

    Estimates the memory needed to serve a model at a given context length
    (weights + runtime overhead + KV cache) and checks it against a GPU's
    VRAM capacity.
    """

    # KV-cache buffer applied per calculation mode (headroom for batching,
    # padding, and allocator fragmentation). Unknown modes fall back to 1.0.
    _MODE_BUFFER = {
        CalculationMode.THEORETICAL: 1.0,   # Ideal minimum, no extra buffer
        CalculationMode.CONSERVATIVE: 1.1,  # 10% buffer for overhead
        CalculationMode.PRODUCTION: 1.25,   # 25% buffer for real-world serving
    }

    def __init__(
        self,
        quantization: Quantization = Quantization.FP16,
        overhead_factor: float = OVERHEAD_FACTOR,
        calculation_mode: CalculationMode = CalculationMode.CONSERVATIVE,
    ):
        """Initialize the calculator.

        Args:
            quantization: Weight precision (default: FP16).
            overhead_factor: Runtime overhead factor (0.30 = 30%).
            calculation_mode: Buffer policy for the KV-cache estimate.
        """
        self.quantization = quantization
        self.overhead_factor = overhead_factor
        self.calculation_mode = calculation_mode

    def calculate_params_memory(self, params_billion: int) -> float:
        """Base memory for the model weights, in GB.

        params_billion is in billions of parameters and 1 billion bytes
        is 1 GB, so the result is simply params_billion × bytes/param
        (FP16 example: 70B × 2 bytes = 140 GB).

        Args:
            params_billion: Model size in billions of parameters.

        Returns:
            Weight memory in GB.
        """
        return params_billion * BYTES_PER_PARAM[self.quantization]

    def calculate_overhead(self, params_memory_gb: float) -> float:
        """Runtime memory overhead (framework, activations, etc.), in GB.

        Args:
            params_memory_gb: Base weight memory in GB.

        Returns:
            params_memory_gb × overhead_factor, in GB.
        """
        return params_memory_gb * self.overhead_factor

    def calculate_kv_cache(
        self,
        kv_cache_mb_per_token: float,
        context_tokens: int,
    ) -> float:
        """KV-cache memory for the requested context, in GB.

        The per-token figure is an FP16 baseline; a precision multiplier
        (FP32 = 2x, INT8/INT4 may be below 1x depending on the framework)
        and the mode-dependent buffer from ``_MODE_BUFFER`` are applied.

        Args:
            kv_cache_mb_per_token: MB per token for the model (FP16 baseline).
            context_tokens: Context size in tokens.

        Returns:
            KV-cache size in GB.
        """
        multiplier = self.quantization.kv_cache_multiplier
        mode_buffer = self._MODE_BUFFER.get(self.calculation_mode, 1.0)
        kv_cache_mb = kv_cache_mb_per_token * context_tokens * multiplier * mode_buffer
        return kv_cache_mb / 1024

    def calculate_total_vram(
        self,
        model: LLMModel,
        context_tokens: int,
    ) -> CalculationBreakdown:
        """Total VRAM required for a model at the given context size.

        Args:
            model: LLM model to evaluate.
            context_tokens: Context size in tokens.

        Returns:
            CalculationBreakdown with each component of the estimate.
        """
        params_memory_gb = self.calculate_params_memory(model.params_billion)
        overhead_gb = self.calculate_overhead(params_memory_gb)
        model_with_overhead_gb = params_memory_gb + overhead_gb
        kv_cache_gb = self.calculate_kv_cache(
            model.kv_cache_mb_per_token,
            context_tokens,
        )
        return CalculationBreakdown(
            params_memory_gb=params_memory_gb,
            overhead_gb=overhead_gb,
            model_with_overhead_gb=model_with_overhead_gb,
            kv_cache_gb=kv_cache_gb,
            total_vram_gb=model_with_overhead_gb + kv_cache_gb,
        )

    def evaluate_pair(
        self,
        model: LLMModel,
        gpu: GPU,
        context_tokens: int,
        quantization: Quantization | None = None,
    ) -> InferenceResult:
        """Evaluate whether a model × GPU pair is viable at this context.

        Args:
            model: LLM model to evaluate.
            gpu: GPU to evaluate.
            context_tokens: Context size in tokens.
            quantization: Optional per-call precision override
                (instance default when None).

        Returns:
            InferenceResult with status and details.

        Note:
            Bug fix: the ``quantization`` override is now applied to the
            VRAM computation itself. Previously it only changed the label
            on the returned result while the math silently used the
            instance default.
        """
        effective_quantization = quantization or self.quantization
        if effective_quantization is self.quantization:
            calculator = self
        else:
            # Honor the per-call override: compute with a calculator
            # configured for the requested precision.
            calculator = VRAMCalculator(
                quantization=effective_quantization,
                overhead_factor=self.overhead_factor,
                calculation_mode=self.calculation_mode,
            )
        breakdown = calculator.calculate_total_vram(model, context_tokens)
        required_vram = breakdown.total_vram_gb

        # Decision logic: fits iff required VRAM <= GPU capacity.
        if required_vram <= gpu.vram_gb:
            status = Status.RUNS
            vram_free_percent = ((gpu.vram_gb - required_vram) / gpu.vram_gb) * 100
            warning = None
            # Flag results that fit with less than the safety margin free.
            if vram_free_percent < (SAFETY_MARGIN_THRESHOLD * 100):
                warning = f"Low safety margin ({vram_free_percent:.1f}% free) / Margem de segurança baixa"
        else:
            status = Status.NOT_RUNS
            vram_free_percent = 0.0
            warning = None

        return InferenceResult(
            model_name=model.name,
            model_params_billion=model.params_billion,
            gpu_name=gpu.name,
            gpu_vram_gb=gpu.vram_gb,
            required_vram_gb=required_vram,
            status=status,
            vram_free_percent=vram_free_percent,
            quantization=effective_quantization,
            warning=warning,
        )

    def calculate_all_combinations(
        self,
        context_tokens: int,
        models: List[LLMModel] | None = None,
        gpus: List[GPU] | None = None,
    ) -> List[InferenceResult]:
        """Evaluate every model × GPU combination.

        Args:
            context_tokens: Context size in tokens.
            models: Models to evaluate (all known models when None).
            gpus: GPUs to evaluate (all known GPUs when None).

        Returns:
            One InferenceResult per (model, gpu) pair.
        """
        if models is None:
            models = get_all_models()
        if gpus is None:
            gpus = get_all_gpus()
        return [
            self.evaluate_pair(model, gpu, context_tokens)
            for model in models
            for gpu in gpus
        ]
def calculate_inference(context_tokens: int) -> dict:
    """Run every model × GPU combination with default calculator settings.

    Args:
        context_tokens: Context window size in tokens.

    Returns:
        Dictionary with the context size, the quantization label used, and
        one serialized result per model × GPU pair.
    """
    calc = VRAMCalculator()
    outcome = calc.calculate_all_combinations(context_tokens)
    return {
        "context_tokens": context_tokens,
        "quantization": calc.quantization.value,
        "results": [item.to_dict() for item in outcome],
    }
# ============================================================================
# LAYER OFFLOAD CALCULATOR
# CALCULADORA DE OFFLOAD DE CAMADAS
# ============================================================================
@dataclass
class LayerOffloadResult:
    """Optimal GPU/CPU layer split for partial-offload inference.

    Attributes:
        total_layers: Total number of layers in the model.
        layers_on_gpu: Layers that fit in GPU VRAM.
        layers_on_cpu: Layers that must stay in system RAM.
        gpu_vram_used: VRAM consumed by the GPU-resident layers, in GB.
        cpu_ram_used: System RAM consumed by the CPU-resident layers, in GB.
        offload_ratio: Fraction of layers on the GPU (0.0 to 1.0).
        performance_impact: Estimated slowdown versus full GPU, in percent.
        recommended_gpu_split: Suggested llama.cpp ``--gpu-layers`` value.
        status: One of "full_gpu", "partial_offload", or "cpu_only".
    """

    total_layers: int
    layers_on_gpu: int
    layers_on_cpu: int
    gpu_vram_used: float
    cpu_ram_used: float
    offload_ratio: float
    performance_impact: float
    recommended_gpu_split: str
    status: str
class LayerOffloadCalculator:
    """Computes how transformer layers split between GPU VRAM and CPU RAM.

    For models that do not fully fit in VRAM, this determines how many
    layers can live on the GPU — the partial-offload strategy used by
    llama.cpp, AutoGPTQ, and similar frameworks.
    """

    def __init__(
        self,
        quantization: Quantization = Quantization.FP16,
        safety_margin_gb: float = 1.0,
        model_format: ModelFormat = ModelFormat.FP16,
    ):
        """Initialize the layer offload calculator.

        Args:
            quantization: Precision used to size each layer.
            safety_margin_gb: VRAM held back as a safety reserve, in GB.
            model_format: Model file format (adds a size overhead factor).
        """
        self.quantization = quantization
        self.safety_margin_gb = safety_margin_gb
        self.model_format = model_format

    def estimate_layer_size_gb(
        self,
        model: LLMModel,
    ) -> float:
        """Estimate the VRAM footprint of one transformer layer, in GB.

        layer_size = (params / num_layers) × bytes_per_param × format_overhead
        (params_billion maps 1:1 to GB at one byte per parameter).

        Args:
            model: LLM model to analyze.

        Returns:
            Estimated size of a single layer in GB.
        """
        per_layer_billion = model.params_billion / model.estimated_layers
        return (
            per_layer_billion
            * BYTES_PER_PARAM[self.quantization]
            * FORMAT_OVERHEAD.get(self.model_format, 1.0)
        )

    def calculate_kv_cache_memory(
        self,
        model: LLMModel,
        context_tokens: int,
    ) -> float:
        """KV-cache memory needed for the given context, in GB.

        Args:
            model: LLM model.
            context_tokens: Context size in tokens.

        Returns:
            KV-cache memory in GB.
        """
        return (model.kv_cache_mb_per_token * context_tokens) / 1024

    def calculate_optimal_offload(
        self,
        model: LLMModel,
        gpu: GPU,
        context_tokens: int,
    ) -> LayerOffloadResult:
        """Compute the best GPU/CPU layer split for this model and GPU.

        Args:
            model: LLM model to analyze.
            gpu: GPU providing the VRAM budget.
            context_tokens: Context size in tokens.

        Returns:
            LayerOffloadResult describing the split and its estimated cost.
        """
        total_layers = model.estimated_layers
        layer_size_gb = self.estimate_layer_size_gb(model)
        kv_cache_gb = self.calculate_kv_cache_memory(model, context_tokens)
        # Runtime overhead (activations, buffers) sized from the full weights.
        overhead_gb = (
            model.params_billion * BYTES_PER_PARAM[self.quantization] * OVERHEAD_FACTOR
        )

        # VRAM left for layers after KV cache, overhead, and safety margin.
        vram_budget = gpu.vram_gb - kv_cache_gb - overhead_gb - self.safety_margin_gb
        if vram_budget <= 0:
            # Not even the KV cache fits comfortably — everything stays on CPU.
            layers_on_gpu = 0
        else:
            layers_on_gpu = min(total_layers, int(vram_budget / layer_size_gb))
        layers_on_cpu = total_layers - layers_on_gpu
        offload_ratio = layers_on_gpu / total_layers if total_layers > 0 else 0

        gpu_vram_used = layers_on_gpu * layer_size_gb + kv_cache_gb + overhead_gb
        cpu_ram_used = layers_on_cpu * layer_size_gb

        # Rough performance model: full GPU is the baseline, CPU-only is
        # ~10x slower, and partial offload costs ~3% per 10% of layers on
        # CPU (PCIe transfer bottleneck).
        if layers_on_gpu == total_layers:
            status, performance_impact = "full_gpu", 0.0
        elif layers_on_gpu == 0:
            status, performance_impact = "cpu_only", 1000.0
        else:
            status = "partial_offload"
            performance_impact = (layers_on_cpu / total_layers) * 30.0

        # Suggested llama.cpp --gpu-layers value.
        if layers_on_gpu == 0:
            recommended = "0 (CPU only)"
        elif layers_on_gpu == total_layers:
            recommended = f"{layers_on_gpu} (all layers, full GPU)"
        else:
            recommended = f"{layers_on_gpu} (of {total_layers} total)"

        return LayerOffloadResult(
            total_layers=total_layers,
            layers_on_gpu=layers_on_gpu,
            layers_on_cpu=layers_on_cpu,
            gpu_vram_used=gpu_vram_used,
            cpu_ram_used=cpu_ram_used,
            offload_ratio=offload_ratio,
            performance_impact=performance_impact,
            recommended_gpu_split=recommended,
            status=status,
        )
# ============================================================================
# CPU OFFLOAD CALCULATOR
# CALCULADORA DE OFFLOAD DE CPU
# ============================================================================
@dataclass
class PCIeConfig:
    """PCIe link description used for offload bandwidth estimates.

    Attributes:
        generation: PCIe generation label ("3.0", "4.0", "5.0").
        bandwidth_gb_s: Theoretical x16 bandwidth in GB/s.
        lanes: Lane count (GPUs typically use x16).
    """

    generation: str
    bandwidth_gb_s: float
    lanes: int = 16

    @property
    def effective_bandwidth_gb_s(self) -> float:
        """Usable bandwidth after protocol overhead (~75% of theoretical)."""
        return self.bandwidth_gb_s * 0.75
# PCIe x16 bandwidth specifications keyed by generation label.
PCIE_CONFIGS: Dict[str, PCIeConfig] = {
    "3.0": PCIeConfig(generation="3.0", bandwidth_gb_s=16.0, lanes=16),  # ~16 GB/s theoretical
    "4.0": PCIeConfig(generation="4.0", bandwidth_gb_s=32.0, lanes=16),  # ~32 GB/s theoretical
    "5.0": PCIeConfig(generation="5.0", bandwidth_gb_s=64.0, lanes=16),  # ~64 GB/s theoretical
}
@dataclass
class CPUOffloadResult:
    """Outcome of a hybrid GPU+CPU offload calculation.

    Attributes:
        system_ram_required: System RAM required, in GB.
        system_ram_available: System RAM available, in GB.
        fits_in_ram: True when the CPU-resident portion fits in system RAM.
        offload_config: Underlying GPU/CPU layer split.
        pcie_generation: PCIe generation used for bandwidth estimates.
        estimated_token_speed: Estimated throughput in tokens/second.
        speed_vs_full_gpu: Throughput relative to full GPU (0.0 to 1.0).
    """

    system_ram_required: float
    system_ram_available: float
    fits_in_ram: bool
    offload_config: LayerOffloadResult
    pcie_generation: str
    estimated_token_speed: float
    speed_vs_full_gpu: float
class CPUOffloadCalculator:
"""Calculates hybrid GPU+CPU inference configurations.
Calcula configurações de inferência híbrida GPU+CPU.
This calculator helps determine:
- How much system RAM is needed for CPU offload
- Performance impact based on PCIe bandwidth
- Optimal layer distribution between GPU and CPU
Esta calculadora ajuda a determinar:
- Quanta RAM do sistema é necessária para offload de CPU
- Impacto de performance baseado em largura de banda PCIe
- Distribuição ótima de camadas entre GPU e CPU
"""
def __init__(
    self,
    quantization: Quantization = Quantization.FP16,
    system_ram_gb: float = 32.0,
    pcie_generation: str = "4.0",
    model_format: ModelFormat = ModelFormat.FP16,
):
    """Set up the CPU offload calculator.

    Args:
        quantization: Weight precision.
        system_ram_gb: System RAM available for CPU layers, in GB.
        pcie_generation: PCIe generation key ("3.0", "4.0", "5.0");
            unknown keys fall back to the PCIe 4.0 profile.
        model_format: Model file format (affects size overhead).
    """
    self.quantization = quantization
    self.system_ram_gb = system_ram_gb
    self.model_format = model_format
    # Unrecognized generation strings fall back to PCIe 4.0.
    self.pcie_config = PCIE_CONFIGS.get(pcie_generation, PCIE_CONFIGS["4.0"])
    # Delegate the GPU/CPU layer split to the layer calculator.
    self.layer_calculator = LayerOffloadCalculator(
        quantization=quantization,
        model_format=model_format,
    )
def calculate_total_model_memory(self, model: LLMModel) -> float:
    """Total memory footprint of the model, in GB.

    Covers the weights (scaled by precision and file-format overhead)
    plus the runtime overhead factor.

    Args:
        model: LLM model.

    Returns:
        Total memory in GB.
    """
    weights_gb = (
        model.params_billion
        * BYTES_PER_PARAM[self.quantization]
        * FORMAT_OVERHEAD.get(self.model_format, 1.0)
    )
    # Add runtime overhead (activations, buffers) on top of the weights.
    return weights_gb + weights_gb * OVERHEAD_FACTOR
def calculate_offload(
    self,
    model: LLMModel,
    gpu: GPU,
    context_tokens: int,
) -> CPUOffloadResult:
    """Build the hybrid GPU+CPU offload configuration for a model/GPU pair.

    Args:
        model: LLM model.
        gpu: GPU to use.
        context_tokens: Context size in tokens.

    Returns:
        CPUOffloadResult with RAM requirements and performance estimates.
    """
    # Split the layers between GPU VRAM and system RAM.
    offload_config = self.layer_calculator.calculate_optimal_offload(
        model, gpu, context_tokens
    )
    # System RAM = CPU-resident layers + a flat 2 GB for CPU runtime overhead.
    # (Fix: removed a total-model-memory computation whose result was never used.)
    system_ram_required = offload_config.cpu_ram_used + 2.0
    fits_in_ram = system_ram_required <= self.system_ram_gb
    # Estimate throughput for this split (helper defined elsewhere in class).
    estimated_speed, speed_ratio = self.estimate_offload_performance(
        offload_config, model
    )
    return CPUOffloadResult(
        system_ram_required=system_ram_required,
        system_ram_available=self.system_ram_gb,
        fits_in_ram=fits_in_ram,
        offload_config=offload_config,
        pcie_generation=self.pcie_config.generation,
        estimated_token_speed=estimated_speed,
        speed_vs_full_gpu=speed_ratio,
    )