{
  "benchmarkName": "8GB VRAM local LLM GPU fit test",
  "testedAt": "2026-06-17T22:26:16.232Z",
  "hostGpu": "NVIDIA GeForce RTX 5090, Driver 595.79, CUDA 13.2, 32607 MiB total VRAM",
  "note": "Measured on an RTX 5090 host and filtered by peak GPU memory delta to judge 8GB VRAM fit. This is not a native RTX 4060 8GB card test.",
  "ollamaVersion": "ollama version is 0.30.5",
  "rawNvidiaSmi": "Wed Jun 17 17:25:22 2026       \r\n+-----------------------------------------------------------------------------------------+\r\n| NVIDIA-SMI 595.79                 Driver Version: 595.79         CUDA Version: 13.2     |\r\n+-----------------------------------------+------------------------+----------------------+\r\n| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |\r\n| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\r\n|                                         |                        |               MIG M. |\r\n|=========================================+========================+======================|\r\n|   0  NVIDIA GeForce RTX 5090      WDDM  |   00000000:01:00.0  On |                  N/A |\r\n|  0%   50C    P0             60W /  575W |    4530MiB /  32607MiB |      5%      Default |\r\n|                                         |                        |                  N/A |\r\n+-----------------------------------------+------------------------+----------------------+\r\n\r\n+-----------------------------------------------------------------------------------------+\r\n| Processes:                                                                              |\r\n|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |\r\n|        ID   ID                                                               Usage      |\r\n|=========================================================================================|\r\n|    0   N/A  N/A            2364    C+G   C:\\Windows\\explorer.exe               N/A      |\r\n|    0   N/A  N/A            2408    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A            4656    C+G   ...__2p2nqsd0c76g0\\app\\Codex.exe      N/A      |\r\n|    0   N/A  N/A            7904    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A            8216    C+G   ...Chrome\\Application\\chrome.exe      N/A      |\r\n|    0   N/A  N/A            8776    C+G   ...2txyewy\\CrossDeviceResume.exe      N/A      |\r\n|    0   N/A  N/A            8824    C+G   ...em32\\ApplicationFrameHost.exe      N/A      |\r\n|    0   N/A  N/A            9044    C+G   ...ntrolPanel\\SystemSettings.exe      N/A      |\r\n|    0   N/A  N/A            9536    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A           10596    C+G   ...y\\StartMenuExperienceHost.exe      N/A      |\r\n|    0   N/A  N/A           10604    C+G   ..._cw5n1h2txyewy\\SearchHost.exe      N/A      |\r\n|    0   N/A  N/A           11316    C+G   ...xyewy\\ShellExperienceHost.exe      N/A      |\r\n|    0   N/A  N/A           12252    C+G   ...acted\\runtime\\WeChatAppEx.exe      N/A      |\r\n|    0   N/A  N/A           13904    C+G   ...App_cw5n1h2txyewy\\LockApp.exe      N/A      |\r\n|    0   N/A  N/A           14964    C+G   ...IA App\\CEF\\NVIDIA Overlay.exe      N/A      |\r\n|    0   N/A  N/A           15828    C+G   ...Local\\PowerToys\\PowerToys.exe      N/A      |\r\n|    0   N/A  N/A           16372    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A           16956    C+G   ...IA App\\CEF\\NVIDIA Overlay.exe      N/A      |\r\n|    0   N/A  N/A           18604    C+G   ...pps\\PowerToys.QuickAccess.exe      N/A      |\r\n|    0   N/A  N/A           19200    C+G   ...yb3d8bbwe\\WindowsTerminal.exe      N/A      |\r\n|    0   N/A  N/A           19484    C+G   ...5n1h2txyewy\\TextInputHost.exe      N/A      |\r\n|    0   N/A  N/A           20388    C+G   ...UI3Apps\\PowerToys.Peek.UI.exe      N/A      |\r\n|    0   N/A  N/A           20992    C+G   ...s\\PowerToys.ColorPickerUI.exe      N/A      |\r\n|    0   N/A  N/A           21048    C+G   ...Toys\\PowerToys.FancyZones.exe      N/A      |\r\n|    0   N/A  N/A           21160    C+G   ...8bbwe\\Microsoft.CmdPal.UI.exe      N/A      |\r\n|    0   N/A  N/A           21180    C+G   ...Chrome\\Application\\chrome.exe      N/A      |\r\n|    0   N/A  N/A           21720    C+G   ...les\\Tencent\\Weixin\\Weixin.exe      N/A      |\r\n|    0   N/A  N/A           22172    C+G   ...Next\\CNext\\RadeonSoftware.exe      N/A      |\r\n|    0   N/A  N/A           22512    C+G   ...D\\CNext\\CNext\\AMDRSSrcExt.exe      N/A      |\r\n|    0   N/A  N/A           26004    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A           26460    C+G   ...__8yrtsj140pw4g\\app\\Slack.exe      N/A      |\r\n|    0   N/A  N/A           26724    C+G   ...__8yrtsj140pw4g\\app\\Slack.exe      N/A      |\r\n|    0   N/A  N/A           27748    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A           28052    C+G   ...8wekyb3d8bbwe\\M365Copilot.exe      N/A      |\r\n|    0   N/A  N/A           29604    C+G   ....0.4022.69\\msedgewebview2.exe      N/A      |\r\n|    0   N/A  N/A           29992    C+G   ...__2p2nqsd0c76g0\\app\\Codex.exe      N/A      |\r\n|    0   N/A  N/A           34792    C+G   ...yb3d8bbwe\\Notepad\\Notepad.exe      N/A      |\r\n|    0   N/A  N/A           38780    C+G   ...indows\\System32\\ShellHost.exe      N/A      |\r\n|    0   N/A  N/A           41384    C+G   ...Chrome\\Application\\chrome.exe      N/A      |\r\n+-----------------------------------------------------------------------------------------+\r\n",
  "ollamaList": "NAME                                       ID              SIZE      MODIFIED    \ngemma4:12b                                 4eb23ef187e2    7.6 GB    12 days ago    \ndeepvlt-embedding:latest                   ac6da0dfba84    639 MB    2 weeks ago    \nqwen3-embedding:0.6b                       ac6da0dfba84    639 MB    2 weeks ago    \nguozhennianhua/deepvault-embedding:4int    bee465912914    396 MB    3 weeks ago    \ndeepvault-embedding:4int                   bee465912914    396 MB    3 weeks ago    \ndeepvault-embedding:latest                 081e0e987291    639 MB    3 weeks ago    \ndeepvault-qwen3:latest                     f896faa1f973    2.5 GB    3 weeks ago    \nqwen3-vl:8b                                901cae732162    6.1 GB    6 weeks ago    \nqwen3.6:27b                                a50eda8ed977    17 GB     7 weeks ago    \nqwen3.5:9b                                 6488c96fa5fa    6.6 GB    7 weeks ago    \nqwen3:4b                                   359d7dd4bcda    2.5 GB    7 weeks ago    \n",
  "results": [
    {
      "model": "qwen3:4b",
      "label": "Small daily assistant",
      "prompt": "Write one practical paragraph for a developer choosing a local LLM for an 8GB VRAM GPU. Mention model size, context length, and why the biggest model is not always the best choice.",
      "options": {
        "num_ctx": 2048,
        "num_predict": 160,
        "temperature": 0
      },
      "startedAt": "2026-06-17T22:25:25.582Z",
      "wallSeconds": 2.188,
      "baselineGpuMiB": 4533,
      "peakGpuMiB": 7776,
      "peakDeltaMiB": 3243,
      "afterStopGpuMiB": 4540,
      "evalCount": 160,
      "evalDurationNs": 501566000,
      "tokensPerSecond": 319.0008892149787,
      "totalDurationNs": 2162720500,
      "loadDurationNs": 1604569400,
      "promptEvalCount": 50,
      "promptEvalDurationNs": 54199000,
      "doneReason": "length",
      "responsePreview": "We are writing a practical paragraph for a developer who has an 8GB VRAM GPU and is choosing a local LLM.\n Key points to cover:\n   - Model size (in terms of parameters or maybe in terms of file size? but typically we talk about parameters for model size)\n   - Context length (the maximum number of tokens the model can handle in one input)\n   - Why the biggest model is not always the best choice (for 8GB VRAM)\n\n Important: The biggest model that fits in 8GB VRAM might be a model that is too large ",
      "thinkingPreview": "",
      "error": null,
      "samples": [
        {
          "atMs": 285,
          "usedMiB": 4533
        },
        {
          "atMs": 538,
          "usedMiB": 4963
        },
        {
          "atMs": 804,
          "usedMiB": 4962
        },
        {
          "atMs": 1060,
          "usedMiB": 7338
        },
        {
          "atMs": 1317,
          "usedMiB": 7338
        },
        {
          "atMs": 1567,
          "usedMiB": 7771
        },
        {
          "atMs": 1829,
          "usedMiB": 7775
        },
        {
          "atMs": 2101,
          "usedMiB": 7776
        }
      ]
    },
    {
      "model": "qwen3-vl:8b",
      "label": "Vision-capable stretch",
      "prompt": "Write one practical paragraph for a developer choosing a local LLM for an 8GB VRAM GPU. Mention model size, context length, and why the biggest model is not always the best choice.",
      "options": {
        "num_ctx": 2048,
        "num_predict": 160,
        "temperature": 0
      },
      "startedAt": "2026-06-17T22:25:34.572Z",
      "wallSeconds": 6.529,
      "baselineGpuMiB": 4580,
      "peakGpuMiB": 11830,
      "peakDeltaMiB": 7250,
      "afterStopGpuMiB": 4406,
      "evalCount": 160,
      "evalDurationNs": 741981000,
      "tokensPerSecond": 215.6389449325522,
      "totalDurationNs": 6524965700,
      "loadDurationNs": 5690516200,
      "promptEvalCount": 48,
      "promptEvalDurationNs": 84887000,
      "doneReason": "length",
      "responsePreview": "",
      "thinkingPreview": "<think>\nOkay, the user wants a practical paragraph for developers picking a local LLM with an 8GB VRAM GPU. They specifically asked about model size, context length, and why bigger isn't always better. \n\nHmm, this seems like a technical query from someone building or optimizing an AI application. Probably a developer who's hit the \"VRAM wall\" when trying to run large models. They're likely frustrated by tutorials that just say \"use Llama 3 70B\" without considering hardware limits. Smart of them ",
      "error": null,
      "samples": [
        {
          "atMs": 295,
          "usedMiB": 4580
        },
        {
          "atMs": 555,
          "usedMiB": 4603
        },
        {
          "atMs": 821,
          "usedMiB": 5012
        },
        {
          "atMs": 1073,
          "usedMiB": 6430
        },
        {
          "atMs": 1324,
          "usedMiB": 5071
        },
        {
          "atMs": 1585,
          "usedMiB": 5083
        },
        {
          "atMs": 1859,
          "usedMiB": 9551
        },
        {
          "atMs": 2101,
          "usedMiB": 9551
        },
        {
          "atMs": 2368,
          "usedMiB": 9551
        },
        {
          "atMs": 2628,
          "usedMiB": 9547
        },
        {
          "atMs": 2883,
          "usedMiB": 9547
        },
        {
          "atMs": 3141,
          "usedMiB": 9473
        },
        {
          "atMs": 3400,
          "usedMiB": 9549
        },
        {
          "atMs": 3652,
          "usedMiB": 9557
        },
        {
          "atMs": 3924,
          "usedMiB": 9558
        },
        {
          "atMs": 4181,
          "usedMiB": 9567
        },
        {
          "atMs": 4441,
          "usedMiB": 9567
        },
        {
          "atMs": 4691,
          "usedMiB": 9568
        },
        {
          "atMs": 4951,
          "usedMiB": 9984
        },
        {
          "atMs": 5200,
          "usedMiB": 11458
        },
        {
          "atMs": 5461,
          "usedMiB": 11455
        },
        {
          "atMs": 5717,
          "usedMiB": 11826
        },
        {
          "atMs": 5974,
          "usedMiB": 11830
        },
        {
          "atMs": 6227,
          "usedMiB": 11823
        },
        {
          "atMs": 6489,
          "usedMiB": 11814
        }
      ]
    },
    {
      "model": "qwen3.5:9b",
      "label": "Upper text range",
      "prompt": "Write one practical paragraph for a developer choosing a local LLM for an 8GB VRAM GPU. Mention model size, context length, and why the biggest model is not always the best choice.",
      "options": {
        "num_ctx": 2048,
        "num_predict": 160,
        "temperature": 0
      },
      "startedAt": "2026-06-17T22:25:47.856Z",
      "wallSeconds": 7.891,
      "baselineGpuMiB": 4430,
      "peakGpuMiB": 11094,
      "peakDeltaMiB": 6664,
      "afterStopGpuMiB": 4221,
      "evalCount": 160,
      "evalDurationNs": 922301000,
      "tokensPerSecond": 173.47915702140622,
      "totalDurationNs": 7886453000,
      "loadDurationNs": 6834134000,
      "promptEvalCount": 52,
      "promptEvalDurationNs": 122909000,
      "doneReason": "length",
      "responsePreview": "When selecting a local Large Language Model (LLM) for an 8GB VRAM GPU like an RTX 4070 or similar consumer card, you should prioritize quantized models in the **1B to 3B parameter range** running at Q4_K_M precision, as these fit comfortably within your memory budget while leaving room for a context window of roughly **2k–8k tokens**; attempting to load larger models like Llama-3-70B or even mid-sized variants often forces you into aggressive quantization (Q2/Q3) that degrades reasoning capabili",
      "thinkingPreview": "",
      "error": null,
      "samples": [
        {
          "atMs": 297,
          "usedMiB": 4217
        },
        {
          "atMs": 546,
          "usedMiB": 4210
        },
        {
          "atMs": 800,
          "usedMiB": 4202
        },
        {
          "atMs": 1062,
          "usedMiB": 5681
        },
        {
          "atMs": 1313,
          "usedMiB": 5699
        },
        {
          "atMs": 1568,
          "usedMiB": 5714
        },
        {
          "atMs": 1823,
          "usedMiB": 4700
        },
        {
          "atMs": 2087,
          "usedMiB": 4700
        },
        {
          "atMs": 2339,
          "usedMiB": 4700
        },
        {
          "atMs": 2601,
          "usedMiB": 9549
        },
        {
          "atMs": 2857,
          "usedMiB": 9550
        },
        {
          "atMs": 3128,
          "usedMiB": 9547
        },
        {
          "atMs": 3379,
          "usedMiB": 9547
        },
        {
          "atMs": 3629,
          "usedMiB": 9521
        },
        {
          "atMs": 3907,
          "usedMiB": 9509
        },
        {
          "atMs": 4154,
          "usedMiB": 9507
        },
        {
          "atMs": 4415,
          "usedMiB": 9507
        },
        {
          "atMs": 4672,
          "usedMiB": 9509
        },
        {
          "atMs": 4933,
          "usedMiB": 9509
        },
        {
          "atMs": 5188,
          "usedMiB": 9543
        },
        {
          "atMs": 5453,
          "usedMiB": 9547
        },
        {
          "atMs": 5718,
          "usedMiB": 9663
        },
        {
          "atMs": 5977,
          "usedMiB": 10831
        },
        {
          "atMs": 6230,
          "usedMiB": 10830
        },
        {
          "atMs": 6485,
          "usedMiB": 10826
        },
        {
          "atMs": 6744,
          "usedMiB": 11088
        },
        {
          "atMs": 7002,
          "usedMiB": 11094
        },
        {
          "atMs": 7257,
          "usedMiB": 11094
        },
        {
          "atMs": 7518,
          "usedMiB": 11089
        },
        {
          "atMs": 7768,
          "usedMiB": 11089
        }
      ]
    },
    {
      "model": "gemma4:12b",
      "label": "8GB boundary test",
      "prompt": "Write one practical paragraph for a developer choosing a local LLM for an 8GB VRAM GPU. Mention model size, context length, and why the biggest model is not always the best choice.",
      "options": {
        "num_ctx": 2048,
        "num_predict": 160,
        "temperature": 0
      },
      "startedAt": "2026-06-17T22:26:02.547Z",
      "wallSeconds": 9.566,
      "baselineGpuMiB": 4204,
      "peakGpuMiB": 12753,
      "peakDeltaMiB": 8549,
      "afterStopGpuMiB": 4220,
      "evalCount": 160,
      "evalDurationNs": 1477915000,
      "tokensPerSecond": 108.26062391950822,
      "totalDurationNs": 9562880700,
      "loadDurationNs": 7930230000,
      "promptEvalCount": 53,
      "promptEvalDurationNs": 147661000,
      "doneReason": "length",
      "responsePreview": "When selecting a model for an 8GB VRAM GPU, you should prioritize efficiency by targeting models in the **7B to 11B parameter range**, specifically those optimized with 4-bit or 5-bit quantization (e.g., Mistral 7B or Llama 3 8B). While it is tempting to chase larger parameters, the \"biggest\" model is not always the best choice because exceeding your VRAM limit forces the system to offload layers to system RAM, causing a massive drop in inference speed and potential crashes. Furthermore, a small",
      "thinkingPreview": "",
      "error": null,
      "samples": [
        {
          "atMs": 287,
          "usedMiB": 4204
        },
        {
          "atMs": 539,
          "usedMiB": 4553
        },
        {
          "atMs": 789,
          "usedMiB": 4204
        },
        {
          "atMs": 1062,
          "usedMiB": 4799
        },
        {
          "atMs": 1312,
          "usedMiB": 4635
        },
        {
          "atMs": 1556,
          "usedMiB": 4635
        },
        {
          "atMs": 1811,
          "usedMiB": 4635
        },
        {
          "atMs": 2066,
          "usedMiB": 11657
        },
        {
          "atMs": 2340,
          "usedMiB": 11657
        },
        {
          "atMs": 2589,
          "usedMiB": 11662
        },
        {
          "atMs": 2849,
          "usedMiB": 11662
        },
        {
          "atMs": 3112,
          "usedMiB": 11662
        },
        {
          "atMs": 3361,
          "usedMiB": 11665
        },
        {
          "atMs": 3628,
          "usedMiB": 11665
        },
        {
          "atMs": 3881,
          "usedMiB": 11665
        },
        {
          "atMs": 4141,
          "usedMiB": 11665
        },
        {
          "atMs": 4395,
          "usedMiB": 11664
        },
        {
          "atMs": 4656,
          "usedMiB": 11665
        },
        {
          "atMs": 4915,
          "usedMiB": 11659
        },
        {
          "atMs": 5172,
          "usedMiB": 11659
        },
        {
          "atMs": 5445,
          "usedMiB": 11659
        },
        {
          "atMs": 5688,
          "usedMiB": 11659
        },
        {
          "atMs": 5945,
          "usedMiB": 11654
        },
        {
          "atMs": 6193,
          "usedMiB": 11654
        },
        {
          "atMs": 6451,
          "usedMiB": 11654
        },
        {
          "atMs": 6706,
          "usedMiB": 11654
        },
        {
          "atMs": 6961,
          "usedMiB": 11654
        },
        {
          "atMs": 7230,
          "usedMiB": 12300
        },
        {
          "atMs": 7490,
          "usedMiB": 12547
        },
        {
          "atMs": 7736,
          "usedMiB": 12736
        },
        {
          "atMs": 8002,
          "usedMiB": 12739
        },
        {
          "atMs": 8246,
          "usedMiB": 12751
        },
        {
          "atMs": 8500,
          "usedMiB": 12752
        },
        {
          "atMs": 8765,
          "usedMiB": 12753
        },
        {
          "atMs": 9006,
          "usedMiB": 12751
        },
        {
          "atMs": 9260,
          "usedMiB": 12751
        },
        {
          "atMs": 9521,
          "usedMiB": 12749
        }
      ]
    }
  ]
}