quantest/quantest.go at main · sammcj/quantest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package quantest

import (
	"fmt"
	"strings"
)

// Version holds the version string for this build. It is declared without an
// initial value, so it is the empty string unless it is set at build time.
// NOTE(review): presumably injected by the linker (-ldflags "-X ...") — confirm
// against the build scripts.
var Version string

// GetModelConfig fetches the configuration for modelName, picking the lookup
// from the shape of the name: a name that contains a colon is handed to
// GetOllamaModelConfig, any other name to GetHFModelConfig. The config and
// error produced by the chosen lookup are returned unchanged.
func GetModelConfig(modelName string) (ModelConfig, error) {
	// NOTE(review): presumably the colon marks an Ollama "name:tag" reference
	// and everything else is a Hugging Face repository ID — confirm.
	hasTagSeparator := strings.ContainsRune(modelName, ':')
	if !hasTagSeparator {
		return GetHFModelConfig(modelName)
	}
	return GetOllamaModelConfig(modelName)
}

// EstimateVRAMForModel builds a VRAM estimation report for modelName.
//
// It loads the model configuration through GetModelConfig, resolves the
// quantisation level, and then calls CalculateVRAM, CalculateContext and
// CalculateBPW to populate the returned VRAMEstimation.
//
// Parameters:
//   - modelName: passed to GetModelConfig and echoed in the result.
//   - vram: the available VRAM; the estimate is compared against it to set
//     FitsAvailable, and it is passed to CalculateContext and CalculateBPW.
//     NOTE(review): the unit is presumably GB — confirm against CalculateVRAM.
//   - contextSize: context size used for the VRAM estimate and for the
//     quantisation recommendation.
//   - quantLevel: quantisation level or BPW value, parsed by ParseBPWOrQuant.
//     When empty, the QuantLevel from the model config is used for Ollama
//     models; for all other models a notice is printed to stdout and
//     "q4_k_m" is used.
//   - kvQuant: KV cache quantisation, converted to KVCacheQuantisation.
//
// A non-nil error is returned when the model config cannot be fetched, the
// quantisation level cannot be parsed, or the VRAM calculation fails. Failures
// of CalculateContext and CalculateBPW are deliberately non-fatal: the report
// then carries MaxContextSize 0, or MaximumQuant "Unknown" with an empty
// recommendation map, respectively.
func EstimateVRAMForModel(modelName string, vram float64, contextSize int, quantLevel, kvQuant string) (*VRAMEstimation, error) {
	modelConfig, err := GetModelConfig(modelName)
	if err != nil {
		return nil, fmt.Errorf("error getting model config: %w", err)
	}

	// Resolve a missing quant level: Ollama models supply their own, every
	// other model falls back to a fixed default.
	// NOTE(review): if an Ollama config carries an empty QuantLevel, the empty
	// string is handed to ParseBPWOrQuant unchanged.
	if quantLevel == "" {
		if modelConfig.IsOllama {
			quantLevel = modelConfig.QuantLevel
		} else {
			fmt.Println("Quant level not provided, and model is not an Ollama model. Defaulting to q4_k_m...")
			quantLevel = "q4_k_m"
		}
	}

	// Parse BPW from quantLevel.
	bpw, err := ParseBPWOrQuant(quantLevel)
	if err != nil {
		return nil, fmt.Errorf("error parsing quantisation level: %w", err)
	}

	// Convert once; the same value feeds every calculation and the report.
	kvCacheQuant := KVCacheQuantisation(kvQuant)

	// Calculate VRAM usage. This is the only calculation whose failure aborts
	// the estimate.
	estimatedVRAM, err := CalculateVRAM(modelConfig, bpw, contextSize, kvCacheQuant)
	if err != nil {
		return nil, fmt.Errorf("error calculating VRAM: %w", err)
	}

	// Calculate maximum context size. Best effort: report 0 when it fails.
	maxContextSize, err := CalculateContext(modelConfig, vram, bpw, kvCacheQuant)
	if err != nil {
		maxContextSize = 0
	}

	// Calculate best BPW. Best effort: report "Unknown" and an empty
	// recommendation map when it fails.
	bestBPW, recommendations, err := CalculateBPW(modelConfig, vram, contextSize, kvCacheQuant, "gguf")
	if err != nil {
		bestBPW = "Unknown"
		recommendations = QuantRecommendations{Recommendations: make(map[int]string)}
	}

	return &VRAMEstimation{
		ModelName:       modelName,
		ModelConfig:     modelConfig,
		ContextSize:     contextSize,
		KVCacheQuant:    kvCacheQuant,
		AvailableVRAM:   vram,
		QuantLevel:      quantLevel,
		EstimatedVRAM:   estimatedVRAM,
		FitsAvailable:   estimatedVRAM <= vram,
		MaxContextSize:  maxContextSize,
		MaximumQuant:    fmt.Sprintf("%v", bestBPW),
		Recommendations: recommendations.Recommendations,
	}, nil
}