main: build = 2998 (9588f196) main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu main: seed = 1716674713 llama_model_loader: loaded meta data with 28 key-value pairs and 322 tensors from aya-23-35B-IMat-GGUF/aya-23-35B.gguf (version GGUF V3 (latest)) llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. llama_model_loader: - kv 0: general.architecture str = command-r llama_model_loader: - kv 1: general.name str = aya-23-35B llama_model_loader: - kv 2: command-r.block_count u32 = 40 llama_model_loader: - kv 3: command-r.context_length u32 = 8192 llama_model_loader: - kv 4: command-r.embedding_length u32 = 8192 llama_model_loader: - kv 5: command-r.feed_forward_length u32 = 22528 llama_model_loader: - kv 6: command-r.attention.head_count u32 = 64 llama_model_loader: - kv 7: command-r.attention.head_count_kv u32 = 64 llama_model_loader: - kv 8: command-r.rope.freq_base f32 = 8000000.000000 llama_model_loader: - kv 9: command-r.attention.layer_norm_epsilon f32 = 0.000010 llama_model_loader: - kv 10: general.file_type u32 = 1 llama_model_loader: - kv 11: command-r.logit_scale f32 = 0.062500 llama_model_loader: - kv 12: command-r.rope.scaling.type str = none llama_model_loader: - kv 13: tokenizer.ggml.model str = gpt2 llama_model_loader: - kv 14: tokenizer.ggml.pre str = command-r llama_model_loader: - kv 15: tokenizer.ggml.tokens arr[str,256000] = ["", "", "", "", ... llama_model_loader: - kv 16: tokenizer.ggml.token_type arr[i32,256000] = [3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, ... llama_model_loader: - kv 17: tokenizer.ggml.merges arr[str,253333] = ["Ġ Ġ", "Ġ t", "e r", "i n", "Ġ a... llama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 5 llama_model_loader: - kv 19: tokenizer.ggml.eos_token_id u32 = 255001 llama_model_loader: - kv 20: tokenizer.ggml.padding_token_id u32 = 0 llama_model_loader: - kv 21: tokenizer.ggml.add_bos_token bool = true llama_model_loader: - kv 22: tokenizer.ggml.add_eos_token bool = false llama_model_loader: - kv 23: tokenizer.chat_template.tool_use str = {{ bos_token }}{% if messages[0]['rol... llama_model_loader: - kv 24: tokenizer.chat_template.rag str = {{ bos_token }}{% if messages[0]['rol... llama_model_loader: - kv 25: tokenizer.chat_templates arr[str,2] = ["rag", "tool_use"] llama_model_loader: - kv 26: tokenizer.chat_template str = {{ bos_token }}{% if messages[0]['rol... llama_model_loader: - kv 27: general.quantization_version u32 = 2 llama_model_loader: - type f32: 41 tensors llama_model_loader: - type f16: 281 tensors llm_load_vocab: special tokens definition check successful ( 1008/256000 ). llm_load_print_meta: format = GGUF V3 (latest) llm_load_print_meta: arch = command-r llm_load_print_meta: vocab type = BPE llm_load_print_meta: n_vocab = 256000 llm_load_print_meta: n_merges = 253333 llm_load_print_meta: n_ctx_train = 8192 llm_load_print_meta: n_embd = 8192 llm_load_print_meta: n_head = 64 llm_load_print_meta: n_head_kv = 64 llm_load_print_meta: n_layer = 40 llm_load_print_meta: n_rot = 128 llm_load_print_meta: n_embd_head_k = 128 llm_load_print_meta: n_embd_head_v = 128 llm_load_print_meta: n_gqa = 1 llm_load_print_meta: n_embd_k_gqa = 8192 llm_load_print_meta: n_embd_v_gqa = 8192 llm_load_print_meta: f_norm_eps = 1.0e-05 llm_load_print_meta: f_norm_rms_eps = 0.0e+00 llm_load_print_meta: f_clamp_kqv = 0.0e+00 llm_load_print_meta: f_max_alibi_bias = 0.0e+00 llm_load_print_meta: f_logit_scale = 6.2e-02 llm_load_print_meta: n_ff = 22528 llm_load_print_meta: n_expert = 0 llm_load_print_meta: n_expert_used = 0 llm_load_print_meta: causal attn = 1 llm_load_print_meta: pooling type = 0 llm_load_print_meta: rope type = 0 llm_load_print_meta: rope scaling = none llm_load_print_meta: freq_base_train = 8000000.0 llm_load_print_meta: freq_scale_train = 1 llm_load_print_meta: n_yarn_orig_ctx = 8192 llm_load_print_meta: rope_finetuned = unknown llm_load_print_meta: ssm_d_conv = 0 llm_load_print_meta: ssm_d_inner = 0 llm_load_print_meta: ssm_d_state = 0 llm_load_print_meta: ssm_dt_rank = 0 llm_load_print_meta: model type = 35B llm_load_print_meta: model ftype = F16 llm_load_print_meta: model params = 34.98 B llm_load_print_meta: model size = 65.16 GiB (16.00 BPW) llm_load_print_meta: general.name = aya-23-35B llm_load_print_meta: BOS token = 5 '' llm_load_print_meta: EOS token = 255001 '<|END_OF_TURN_TOKEN|>' llm_load_print_meta: PAD token = 0 '' llm_load_print_meta: LF token = 136 'Ä' ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes ggml_cuda_init: found 1 CUDA devices: Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes llm_load_tensors: ggml ctx size = 0.34 MiB llm_load_tensors: offloading 10 repeating layers to GPU llm_load_tensors: offloaded 10/41 layers to GPU llm_load_tensors: CPU buffer size = 66721.28 MiB llm_load_tensors: CUDA0 buffer size = 15680.31 MiB ........................................................................................... llama_new_context_with_model: n_ctx = 512 llama_new_context_with_model: n_batch = 512 llama_new_context_with_model: n_ubatch = 512 llama_new_context_with_model: flash_attn = 0 llama_new_context_with_model: freq_base = 8000000.0 llama_new_context_with_model: freq_scale = 1 llama_kv_cache_init: CUDA_Host KV buffer size = 480.00 MiB llama_kv_cache_init: CUDA0 KV buffer size = 160.00 MiB llama_new_context_with_model: KV self size = 640.00 MiB, K (f16): 320.00 MiB, V (f16): 320.00 MiB llama_new_context_with_model: CUDA_Host output buffer size = 0.98 MiB llama_new_context_with_model: CUDA0 compute buffer size = 4516.00 MiB llama_new_context_with_model: CUDA_Host compute buffer size = 33.01 MiB llama_new_context_with_model: graph nodes = 1208 llama_new_context_with_model: graph splits = 304 system_info: n_threads = 25 / 32 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | compute_imatrix: tokenizing the input .. compute_imatrix: tokenization took 195.016 ms compute_imatrix: computing over 194 chunks with batch_size 512 compute_imatrix: 3.91 seconds per pass - ETA 12.63 minutes [1]5.7654,[2]4.0556,[3]3.7932,[4]4.1756,[5]4.1190,[6]3.8734,[7]4.6046,[8]4.8725,[9]5.4760, save_imatrix: stored collected data after 10 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [10]5.7366,[11]5.8940,[12]6.0375,[13]6.4213,[14]6.6150,[15]6.9195,[16]7.0883,[17]7.2951,[18]7.5592,[19]7.6438, save_imatrix: stored collected data after 20 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [20]7.2758,[21]7.0995,[22]6.9637,[23]6.6133,[24]6.3833,[25]6.3229,[26]6.4615,[27]6.3775,[28]6.5183,[29]6.3846, save_imatrix: stored collected data after 30 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [30]6.3824,[31]6.1050,[32]5.9291,[33]5.8513,[34]5.8304,[35]5.8026,[36]5.8335,[37]5.9018,[38]5.9573,[39]6.0502, save_imatrix: stored collected data after 40 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [40]6.1367,[41]6.2154,[42]6.4067,[43]6.6069,[44]6.8114,[45]6.9215,[46]6.8986,[47]6.8746,[48]6.8201,[49]6.9057, save_imatrix: stored collected data after 50 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [50]6.9739,[51]7.0457,[52]7.1585,[53]7.2114,[54]7.2555,[55]7.3090,[56]7.3144,[57]7.3257,[58]7.3349,[59]7.3297, save_imatrix: stored collected data after 60 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [60]7.4142,[61]7.4819,[62]7.5189,[63]7.5494,[64]7.4811,[65]7.4235,[66]7.3807,[67]7.3641,[68]7.3352,[69]7.2930, save_imatrix: stored collected data after 70 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [70]7.2121,[71]7.2013,[72]7.1772,[73]7.1835,[74]7.2057,[75]7.2189,[76]7.2290,[77]7.2053,[78]7.1416,[79]7.0566, save_imatrix: stored collected data after 80 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [80]7.0084,[81]6.9395,[82]6.8934,[83]6.8255,[84]6.7942,[85]6.7825,[86]6.7648,[87]6.7581,[88]6.7758,[89]6.7894, save_imatrix: stored collected data after 90 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [90]6.8124,[91]6.7853,[92]6.7439,[93]6.7325,[94]6.7536,[95]6.7447,[96]6.7446,[97]6.7477,[98]6.7737,[99]6.7457, save_imatrix: stored collected data after 100 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [100]6.7739,[101]6.7715,[102]6.7508,[103]6.7648,[104]6.7474,[105]6.7189,[106]6.6777,[107]6.7072,[108]6.7467,[109]6.7360, save_imatrix: stored collected data after 110 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [110]6.7270,[111]6.7261,[112]6.7714,[113]6.7166,[114]6.7001,[115]6.6775,[116]6.6332,[117]6.6115,[118]6.5818,[119]6.5468, save_imatrix: stored collected data after 120 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [120]6.5167,[121]6.4764,[122]6.4525,[123]6.4167,[124]6.3857,[125]6.3666,[126]6.3854,[127]6.4183,[128]6.4480,[129]6.4659, save_imatrix: stored collected data after 130 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [130]6.4972,[131]6.5829,[132]6.6624,[133]6.7438,[134]6.8327,[135]6.8791,[136]6.9207,[137]6.9391,[138]6.9684,[139]6.9847, save_imatrix: stored collected data after 140 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [140]7.0060,[141]7.0407,[142]7.0672,[143]7.0978,[144]7.1228,[145]7.1438,[146]7.1297,[147]7.1694,[148]7.1815,[149]7.2047, save_imatrix: stored collected data after 150 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [150]7.1886,[151]7.2105,[152]7.2040,[153]7.1868,[154]7.1736,[155]7.1720,[156]7.1733,[157]7.1775,[158]7.1735,[159]7.1426, save_imatrix: stored collected data after 160 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [160]7.1877,[161]7.2294,[162]7.2675,[163]7.3431,[164]7.3812,[165]7.3903,[166]7.3919,[167]7.4201,[168]7.4045,[169]7.4347, save_imatrix: stored collected data after 170 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [170]7.4195,[171]7.4098,[172]7.4208,[173]7.4430,[174]7.4460,[175]7.4539,[176]7.4689,[177]7.4688,[178]7.4555,[179]7.4409, save_imatrix: stored collected data after 180 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [180]7.4359,[181]7.4262,[182]7.4254,[183]7.4147,[184]7.4072,[185]7.3785,[186]7.3827,[187]7.3697,[188]7.3819,[189]7.3945, save_imatrix: stored collected data after 190 chunks in aya-23-35B-IMat-GGUF/imatrix.dat [190]7.4083,[191]7.4241,[192]7.4100,[193]7.3691,[194]7.3317, save_imatrix: stored collected data after 194 chunks in aya-23-35B-IMat-GGUF/imatrix.dat llama_print_timings: load time = 7127.77 ms llama_print_timings: sample time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) llama_print_timings: prompt eval time = 737457.76 ms / 99328 tokens ( 7.42 ms per token, 134.69 tokens per second) llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) llama_print_timings: total time = 744301.62 ms / 99329 tokens Final estimate: PPL = 7.3317 +/- 0.08422