{"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOgngB7a+G/dFQHszPkGRIV"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30761,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip -q install huggingface_hub\n!pip -q install transformers sentencepiece\nfrom huggingface_hub import upload_file, create_repo\n\nfrom kaggle_secrets import UserSecretsClient\nuser_secrets = UserSecretsClient()\nHF_TOKEN = user_secrets.get_secret(\"HF_TOKEN\")","metadata":{"id":"oCMMoIiOHXdL","executionInfo":{"status":"ok","timestamp":1726501025234,"user_tz":-60,"elapsed":17496,"user":{"displayName":"Lyte","userId":"00368277356076556155"}},"execution":{"iopub.status.busy":"2024-09-16T15:43:11.615446Z","iopub.execute_input":"2024-09-16T15:43:11.616891Z","iopub.status.idle":"2024-09-16T15:43:44.237618Z","shell.execute_reply.started":"2024-09-16T15:43:11.616834Z","shell.execute_reply":"2024-09-16T15:43:44.236028Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"!git clone https://github.com/ggerganov/llama.cpp\n%cd llama.cpp\n!make","metadata":{"id":"SO0QBqij-kSj","colab":{"base_uri":"https://localhost:8080/"},"outputId":"59059c96-5118-4278-ad0c-52390b135f10","scrolled":true,"execution":{"iopub.status.busy":"2024-09-16T15:43:44.241022Z","iopub.execute_input":"2024-09-16T15:43:44.241624Z","iopub.status.idle":"2024-09-16T15:54:35.487734Z","shell.execute_reply.started":"2024-09-16T15:43:44.241559Z","shell.execute_reply":"2024-09-16T15:54:35.485038Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Cloning into 'llama.cpp'...\nremote: Enumerating objects: 34149, done.\u001b[K\nremote: Counting objects: 100% (7801/7801), done.\u001b[K\nremote: Compressing objects: 100% (691/691), done.\u001b[K\nremote: Total 34149 (delta 7499), reused 7183 (delta 7105), pack-reused 26348 (from 1)\u001b[K\nReceiving objects: 100% (34149/34149), 57.58 MiB | 22.22 MiB/s, done.\nResolving deltas: 100% (24732/24732), done.\n/kaggle/working/llama.cpp\nI ccache not found. 
Consider installing it for faster compilation.\nI llama.cpp build info: \nI UNAME_S: Linux\nI UNAME_P: x86_64\nI UNAME_M: x86_64\nI CFLAGS: -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion \nI CXXFLAGS: -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE \nI NVCCFLAGS: -std=c++11 -O3 -g \nI LDFLAGS: \nI CC: cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nI CXX: c++ (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\n\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -c ggml/src/llamafile/sgemm.cpp -o ggml/src/llamafile/sgemm.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml/src/ggml.c -o ggml/src/ggml.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml/src/ggml-alloc.c -o ggml/src/ggml-alloc.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml/src/ggml-backend.c -o ggml/src/ggml-backend.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml/src/ggml-quants.c -o ggml/src/ggml-quants.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE -std=c11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes 
-Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion -c ggml/src/ggml-aarch64.c -o ggml/src/ggml-aarch64.o\n\n==== Run ./llama-cli -h for help. ====\n\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds
-Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE examples/deprecation-warning/deprecation-warning.o -o main \nNOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead.\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE examples/deprecation-warning/deprecation-warning.o -o server \nNOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead.\n","output_type":"stream"}]},{"cell_type":"code","source":"!mkdir model/\n%cd model/\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/config.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/generation_config.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/added_tokens.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/special_tokens_map.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/rwkv_vocab_v20230424.txt\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/pytorch_model.bin\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/tokenizer_config.json\n!ls\n%cd ..","metadata":{"id":"k-vJrh2cDZmk","scrolled":true,"execution":{"iopub.status.busy":"2024-09-16T16:10:40.690242Z","iopub.execute_input":"2024-09-16T16:10:40.690904Z","iopub.status.idle":"2024-09-16T16:12:31.318947Z","shell.execute_reply.started":"2024-09-16T16:10:40.690849Z","shell.execute_reply":"2024-09-16T16:12:31.317019Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"mkdir: cannot create directory 'model/': File exists\n/kaggle/working/llama.cpp/model\n--2024-09-16 16:10:43-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/config.json\nResolving huggingface.co (huggingface.co)... 3.165.160.59, 3.165.160.12, 3.165.160.11, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.59|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 610 [text/plain]\nSaving to: 'config.json'\n\nconfig.json 100%[===================>] 610 --.-KB/s in 0s \n\n2024-09-16 16:10:43 (58.0 MB/s) - 'config.json' saved [610/610]\n\n--2024-09-16 16:10:44-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/generation_config.json\nResolving huggingface.co (huggingface.co)... 3.165.160.11, 3.165.160.61, 3.165.160.12, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.11|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 260 [text/plain]\nSaving to: 'generation_config.json'\n\ngeneration_config.j 100%[===================>] 260 --.-KB/s in 0s \n\n2024-09-16 16:10:44 (22.7 MB/s) - 'generation_config.json' saved [260/260]\n\n--2024-09-16 16:10:45-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/added_tokens.json\nResolving huggingface.co (huggingface.co)... 3.165.160.59, 3.165.160.11, 3.165.160.12, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.59|:443... connected.\nHTTP request sent, awaiting response... 
200 OK\nLength: 15 [text/plain]\nSaving to: 'added_tokens.json'\n\nadded_tokens.json 100%[===================>] 15 --.-KB/s in 0s \n\n2024-09-16 16:10:45 (1.24 MB/s) - 'added_tokens.json' saved [15/15]\n\n--2024-09-16 16:10:47-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/special_tokens_map.json\nResolving huggingface.co (huggingface.co)... 3.165.160.61, 3.165.160.59, 3.165.160.11, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.61|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 91 [text/plain]\nSaving to: 'special_tokens_map.json'\n\nspecial_tokens_map. 100%[===================>] 91 --.-KB/s in 0s \n\n2024-09-16 16:10:47 (4.87 MB/s) - 'special_tokens_map.json' saved [91/91]\n\n--2024-09-16 16:10:48-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/rwkv_vocab_v20230424.txt\nResolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.11, 3.165.160.59, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 1093733 (1.0M) [text/plain]\nSaving to: 'rwkv_vocab_v20230424.txt'\n\nrwkv_vocab_v2023042 100%[===================>] 1.04M 5.37MB/s in 0.2s \n\n2024-09-16 16:10:48 (5.37 MB/s) - 'rwkv_vocab_v20230424.txt' saved [1093733/1093733]\n\n--2024-09-16 16:10:49-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/pytorch_model.bin\nResolving huggingface.co (huggingface.co)... 3.165.160.61, 3.165.160.11, 3.165.160.59, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.61|:443... connected.\nHTTP request sent, awaiting response... 302 Found\nLocation: https://cdn-lfs-us-1.huggingface.co/repos/f6/ff/f6ff4cd2e55f87480652292741ef4ea949af295cd3fee9e7279db09d3ad866d0/609ffca33ff73d53bf059f7336396dc39bfe76764d3b263429ee5f2933688993?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1726762250&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNjc2MjI1MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2Y2L2ZmL2Y2ZmY0Y2QyZTU1Zjg3NDgwNjUyMjkyNzQxZWY0ZWE5NDlhZjI5NWNkM2ZlZTllNzI3OWRiMDlkM2FkODY2ZDAvNjA5ZmZjYTMzZmY3M2Q1M2JmMDU5ZjczMzYzOTZkYzM5YmZlNzY3NjRkM2IyNjM0MjllZTVmMjkzMzY4ODk5Mz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=OpzRJc69pyjGt9FE7yAP74u0dyWRqQcXYY4pPEmgPyMJj7m6mtLd1rddbB3CruHOu9H%7ELicWaivltQV8EfEeyAZt4KdgMxjfeBvkrQNFkoEb-k6d-qCKbpzVGPZijtnLUMUtVv5LW76393LAtJED47McOKbevcZIUonFPbAyTDv0n3p6i8YItRUAwUo1LS-EJWPNZ3EROxGRANIowpsdGZSRPiu1nBqfuRdpvz9avdarzdO9vNkJnDjPn%7EW57vyDL0VWGSRQ1tfIfoTFok9Ta-ny3cPN779Kj3eZOXRLoqQd0q7D5VGvuLMApccga6IcNClgiV3hcNf-r2xahz9A5w__&Key-Pair-Id=K24J24Z295AEI9 [following]\n--2024-09-16 16:10:50-- 
https://cdn-lfs-us-1.huggingface.co/repos/f6/ff/f6ff4cd2e55f87480652292741ef4ea949af295cd3fee9e7279db09d3ad866d0/609ffca33ff73d53bf059f7336396dc39bfe76764d3b263429ee5f2933688993?response-content-disposition=inline%3B+filename*%3DUTF-8''pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1726762250&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNjc2MjI1MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2Y2L2ZmL2Y2ZmY0Y2QyZTU1Zjg3NDgwNjUyMjkyNzQxZWY0ZWE5NDlhZjI5NWNkM2ZlZTllNzI3OWRiMDlkM2FkODY2ZDAvNjA5ZmZjYTMzZmY3M2Q1M2JmMDU5ZjczMzYzOTZkYzM5YmZlNzY3NjRkM2IyNjM0MjllZTVmMjkzMzY4ODk5Mz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=OpzRJc69pyjGt9FE7yAP74u0dyWRqQcXYY4pPEmgPyMJj7m6mtLd1rddbB3CruHOu9H~LicWaivltQV8EfEeyAZt4KdgMxjfeBvkrQNFkoEb-k6d-qCKbpzVGPZijtnLUMUtVv5LW76393LAtJED47McOKbevcZIUonFPbAyTDv0n3p6i8YItRUAwUo1LS-EJWPNZ3EROxGRANIowpsdGZSRPiu1nBqfuRdpvz9avdarzdO9vNkJnDjPn~W57vyDL0VWGSRQ1tfIfoTFok9Ta-ny3cPN779Kj3eZOXRLoqQd0q7D5VGvuLMApccga6IcNClgiV3hcNf-r2xahz9A5w__&Key-Pair-Id=K24J24Z295AEI9\nResolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 3.163.189.28, 3.163.189.20, 3.163.189.91, ...\nConnecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|3.163.189.28|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 3199826561 (3.0G) [application/octet-stream]\nSaving to: 'pytorch_model.bin'\n\npytorch_model.bin 100%[===================>] 2.98G 32.2MB/s in 98s \n\n2024-09-16 16:12:28 (31.1 MB/s) - 'pytorch_model.bin' saved [3199826561/3199826561]\n\n--2024-09-16 16:12:29-- https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/tokenizer_config.json\nResolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.11, 3.165.160.59, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.\nHTTP request sent, awaiting response... 
200 OK\nLength: 234 [text/plain]\nSaving to: 'tokenizer_config.json'\n\ntokenizer_config.js 100%[===================>] 234 --.-KB/s in 0s \n\n2024-09-16 16:12:29 (18.6 MB/s) - 'tokenizer_config.json' saved [234/234]\n\nadded_tokens.json\tpytorch_model.bin\t tokenizer_config.json\nconfig.json\t\trwkv_vocab_v20230424.txt\ngeneration_config.json\tspecial_tokens_map.json\n/kaggle/working/llama.cpp\n","output_type":"stream"}]},{"cell_type":"code","source":"!python convert_hf_to_gguf.py model/","metadata":{"id":"zHtxjcITBSzN","execution":{"iopub.status.busy":"2024-09-16T16:12:31.321369Z","iopub.execute_input":"2024-09-16T16:12:31.321900Z","iopub.status.idle":"2024-09-16T16:12:58.901563Z","shell.execute_reply.started":"2024-09-16T16:12:31.321840Z","shell.execute_reply":"2024-09-16T16:12:58.899537Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Writing: 100%|███████████████████████████| 3.25G/3.25G [00:19<00:00, 168Mbyte/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"!ls","metadata":{"id":"VTibFqk_dJAG","execution":{"iopub.status.busy":"2024-09-16T15:58:07.359854Z","iopub.execute_input":"2024-09-16T15:58:07.360380Z","iopub.status.idle":"2024-09-16T15:58:08.541518Z","shell.execute_reply.started":"2024-09-16T15:58:07.360325Z","shell.execute_reply":"2024-09-16T15:58:08.539947Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"AUTHORS\t\t\t llama-gritlm\nCMakeLists.txt\t\t llama-imatrix\nCMakePresets.json\t llama-infill\nCONTRIBUTING.md\t\t llama-llava-cli\nLICENSE\t\t\t llama-lookahead\nMakefile\t\t llama-lookup\nPackage.swift\t\t llama-lookup-create\nREADME.md\t\t llama-lookup-merge\nSECURITY.md\t\t llama-lookup-stats\nci\t\t\t llama-minicpmv-cli\ncmake\t\t\t llama-parallel\ncommon\t\t\t llama-passkey\nconvert_hf_to_gguf.py\t llama-perplexity\nconvert_hf_to_gguf_update.py llama-q8dot\nconvert_llama_ggml_to_gguf.py llama-quantize\nconvert_lora_to_gguf.py llama-quantize-stats\ndocs\t\t\t llama-retrieval\nexamples\t\t llama-save-load-state\nflake.lock\t\t llama-server\nflake.nix\t\t llama-simple\nggml\t\t\t llama-speculative\ngguf-py\t\t\t llama-tokenize\ngrammars\t\t llama-vdot\ninclude\t\t\t main\nlibllava.a\t\t media\nllama-baby-llama\t model\nllama-batched\t\t models\nllama-batched-bench\t mypy.ini\nllama-bench\t\t pocs\nllama-benchmark-matmult poetry.lock\nllama-cli\t\t prompts\nllama-convert-llama2c-to-ggml pyproject.toml\nllama-cvector-generator pyrightconfig.json\nllama-embedding\t\t requirements\nllama-eval-callback\t requirements.txt\nllama-export-lora\t scripts\nllama-gbnf-validator\t server\nllama-gen-docs\t\t spm-headers\nllama-gguf\t\t src\nllama-gguf-hash\t\t tests\nllama-gguf-split\n","output_type":"stream"}]},{"cell_type":"code","source":"!./llama-quantize ./model/Model-1.6B-F16.gguf ./model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf Q2_K","metadata":{"id":"bgGOaSYxAO_K","scrolled":true,"execution":{"iopub.status.busy":"2024-09-16T16:25:01.963314Z","iopub.execute_input":"2024-09-16T16:25:01.963850Z","iopub.status.idle":"2024-09-16T16:26:43.499980Z","shell.execute_reply.started":"2024-09-16T16:25:01.963797Z","shell.execute_reply":"2024-09-16T16:26:43.497981Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stdout","text":"main: build = 3772 (23e0d70b)\nmain: built with cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 for x86_64-linux-gnu\nmain: quantizing './model/Model-1.6B-F16.gguf' to './model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf' as Q2_K\nllama_model_loader: loaded meta data with 21 key-value pairs and 678 tensors from 
./model/Model-1.6B-F16.gguf (version GGUF V3 (latest))\nllama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\nllama_model_loader: - kv 0: general.architecture str = rwkv6\nllama_model_loader: - kv 1: general.type str = model\nllama_model_loader: - kv 2: general.name str = Model\nllama_model_loader: - kv 3: general.size_label str = 1.6B\nllama_model_loader: - kv 4: rwkv6.context_length u32 = 1048576\nllama_model_loader: - kv 5: rwkv6.embedding_length u32 = 2048\nllama_model_loader: - kv 6: rwkv6.block_count u32 = 24\nllama_model_loader: - kv 7: rwkv6.attention.layer_norm_epsilon f32 = 0.000010\nllama_model_loader: - kv 8: rwkv6.rescale_every_n_layers u32 = 6\nllama_model_loader: - kv 9: rwkv6.wkv.head_size u32 = 64\nllama_model_loader: - kv 10: rwkv6.time_mix_extra_dim u32 = 32\nllama_model_loader: - kv 11: rwkv6.time_decay_extra_dim u32 = 64\nllama_model_loader: - kv 12: rwkv6.feed_forward_length u32 = 7168\nllama_model_loader: - kv 13: general.file_type u32 = 1\nllama_model_loader: - kv 14: rwkv6.attention.head_count u32 = 0\nllama_model_loader: - kv 15: tokenizer.ggml.model str = rwkv\nllama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,65536] = [\"\", \"\\\\x00\", \"\\\\x01\", \"\\\\x02\", \"\\...\nllama_model_loader: - kv 17: tokenizer.ggml.token_type arr[i32,65536] = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nllama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 0\nllama_model_loader: - kv 19: tokenizer.ggml.eos_token_id u32 = 0\nllama_model_loader: - kv 20: general.quantization_version u32 = 2\nllama_model_loader: - type f32: 484 tensors\nllama_model_loader: - type f16: 194 tensors\n[ 1/ 678] token_embd.weight - [ 2048, 65536, 1, 1], type = f16, converting to q2_K .. size = 256.00 MiB -> 42.00 MiB\n[ 2/ 678] blk.0.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 3/ 678] blk.0.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 4/ 678] blk.0.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 5/ 678] blk.0.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 6/ 678] token_embd_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 7/ 678] token_embd_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 8/ 678] blk.0.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 9/ 678] blk.0.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 10/ 678] blk.0.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 11/ 678] blk.0.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 12/ 678] blk.0.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 13/ 678] blk.0.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 14/ 678] blk.0.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 15/ 678] blk.0.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 16/ 678] blk.0.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 17/ 678] blk.0.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 18/ 678] blk.0.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 19/ 678] blk.0.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 20/ 678] blk.0.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 8.00 MiB -> 1.31 MiB\n[ 21/ 678] blk.0.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 22/ 678] blk.0.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 23/ 678] blk.0.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 24/ 678] blk.0.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 25/ 678] blk.0.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 26/ 678] blk.0.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 27/ 678] blk.0.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 28/ 678] blk.0.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 29/ 678] blk.0.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 30/ 678] blk.0.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 31/ 678] blk.0.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 32/ 678] blk.1.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 33/ 678] blk.1.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 34/ 678] blk.1.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 35/ 678] blk.1.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 36/ 678] blk.1.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 37/ 678] blk.1.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 38/ 678] blk.1.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 39/ 678] blk.1.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 40/ 678] blk.1.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 41/ 678] blk.1.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 42/ 678] blk.1.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 43/ 678] blk.1.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 44/ 678] blk.1.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 45/ 678] blk.1.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 46/ 678] blk.1.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 47/ 678] blk.1.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 48/ 678] blk.1.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 49/ 678] blk.1.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 50/ 678] blk.1.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 51/ 678] blk.1.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 52/ 678] blk.1.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 8.00 MiB -> 1.31 MiB\n[ 53/ 678] blk.1.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 54/ 678] blk.1.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 55/ 678] blk.1.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 56/ 678] blk.1.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 57/ 678] blk.1.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 58/ 678] blk.1.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 59/ 678] blk.1.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 60/ 678] blk.2.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 61/ 678] blk.2.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 62/ 678] blk.2.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 63/ 678] blk.2.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 64/ 678] blk.2.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 65/ 678] blk.2.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 66/ 678] blk.2.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 67/ 678] blk.2.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 68/ 678] blk.2.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 69/ 678] blk.2.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 70/ 678] blk.2.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 71/ 678] blk.2.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 72/ 678] blk.2.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 73/ 678] blk.2.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 74/ 678] blk.2.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 75/ 678] blk.2.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 76/ 678] blk.2.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 77/ 678] blk.2.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 78/ 678] blk.2.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 79/ 678] blk.2.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 80/ 678] blk.2.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 81/ 678] blk.2.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 82/ 678] blk.2.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 83/ 678] blk.2.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 84/ 678] blk.2.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 85/ 678] blk.2.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 86/ 678] blk.2.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 87/ 678] blk.2.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 88/ 678] blk.3.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 89/ 678] blk.3.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 90/ 678] blk.3.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 91/ 678] blk.3.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 92/ 678] blk.3.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 93/ 678] blk.3.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 94/ 678] blk.3.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 95/ 678] blk.3.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 96/ 678] blk.3.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 97/ 678] blk.3.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 98/ 678] blk.3.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 99/ 678] blk.3.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 100/ 678] blk.3.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 101/ 678] blk.3.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 102/ 678] blk.3.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 103/ 678] blk.3.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 104/ 678] blk.3.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 105/ 678] blk.3.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 106/ 678] blk.3.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 107/ 678] blk.3.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 108/ 678] blk.3.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 109/ 678] blk.3.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 110/ 678] blk.3.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 111/ 678] blk.3.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 112/ 678] blk.3.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 113/ 678] blk.3.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 114/ 678] blk.3.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 115/ 678] blk.3.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 116/ 678] blk.4.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 117/ 678] blk.4.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 118/ 678] blk.4.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 119/ 678] blk.4.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 120/ 678] blk.4.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 121/ 678] blk.4.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 122/ 678] blk.4.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 123/ 678] blk.4.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 124/ 678] blk.4.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 125/ 678] blk.4.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 126/ 678] blk.4.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 127/ 678] blk.4.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 128/ 678] blk.4.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 129/ 678] blk.4.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 130/ 678] blk.4.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 131/ 678] blk.4.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 132/ 678] blk.4.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 133/ 678] blk.4.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 134/ 678] blk.4.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 135/ 678] blk.4.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 136/ 678] blk.4.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 137/ 678] blk.4.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 138/ 678] blk.4.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 139/ 678] blk.4.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 140/ 678] blk.4.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 141/ 678] blk.4.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 142/ 678] blk.4.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 143/ 678] blk.4.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 144/ 678] blk.5.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 145/ 678] blk.5.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 146/ 678] blk.5.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 147/ 678] blk.5.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 148/ 678] blk.5.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 149/ 678] blk.5.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 150/ 678] blk.5.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 151/ 678] blk.5.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 152/ 678] blk.5.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 153/ 678] blk.5.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 154/ 678] blk.5.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 155/ 678] blk.5.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 156/ 678] blk.5.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 157/ 678] blk.5.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 158/ 678] blk.5.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 159/ 678] blk.5.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 160/ 678] blk.5.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 161/ 678] blk.5.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 162/ 678] blk.5.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 163/ 678] blk.5.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 164/ 678] blk.5.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 165/ 678] blk.5.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 166/ 678] blk.5.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 167/ 678] blk.5.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 168/ 678] blk.5.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 169/ 678] blk.5.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 170/ 678] blk.5.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 171/ 678] blk.5.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 172/ 678] blk.6.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 173/ 678] blk.6.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 174/ 678] blk.6.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 175/ 678] blk.6.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 176/ 678] blk.6.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 177/ 678] blk.6.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 178/ 678] blk.6.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 179/ 678] blk.6.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 180/ 678] blk.6.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 181/ 678] blk.6.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 182/ 678] blk.6.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 183/ 678] blk.6.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 184/ 678] blk.6.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 185/ 678] blk.6.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 186/ 678] blk.6.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 187/ 678] blk.6.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 188/ 678] blk.6.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 189/ 678] blk.6.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 190/ 678] blk.6.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 191/ 678] blk.6.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 192/ 678] blk.6.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 193/ 678] blk.6.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 194/ 678] blk.6.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 195/ 678] blk.6.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 196/ 678] blk.6.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 197/ 678] blk.6.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 198/ 678] blk.6.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 199/ 678] blk.6.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 200/ 678] blk.7.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 201/ 678] blk.7.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 202/ 678] blk.7.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 203/ 678] blk.7.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 204/ 678] blk.7.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 205/ 678] blk.7.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 206/ 678] blk.7.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 207/ 678] blk.7.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 208/ 678] blk.7.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 209/ 678] blk.7.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 210/ 678] blk.7.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 211/ 678] blk.7.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 212/ 678] blk.7.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 213/ 678] blk.7.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 214/ 678] blk.7.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 215/ 678] blk.7.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 216/ 678] blk.7.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 217/ 678] blk.7.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 218/ 678] blk.7.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 219/ 678] blk.7.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 220/ 678] blk.7.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 221/ 678] blk.7.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 222/ 678] blk.7.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 223/ 678] blk.7.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 224/ 678] blk.7.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 225/ 678] blk.7.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 226/ 678] blk.7.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 227/ 678] blk.7.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 228/ 678] blk.8.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 229/ 678] blk.8.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 230/ 678] blk.8.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 231/ 678] blk.8.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 232/ 678] blk.8.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 233/ 678] blk.8.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 234/ 678] blk.8.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 235/ 678] blk.8.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 236/ 678] blk.8.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 237/ 678] blk.8.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 238/ 678] blk.8.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 239/ 678] blk.8.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 240/ 678] blk.8.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 241/ 678] blk.8.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 242/ 678] blk.8.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 243/ 678] blk.8.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 244/ 678] blk.8.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 245/ 678] blk.8.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 246/ 678] blk.8.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 247/ 678] blk.8.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 248/ 678] blk.8.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 249/ 678] blk.8.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 250/ 678] blk.8.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 251/ 678] blk.8.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 252/ 678] blk.8.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 253/ 678] blk.8.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 254/ 678] blk.8.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 255/ 678] blk.8.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 256/ 678] blk.9.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 257/ 678] blk.9.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 258/ 678] blk.9.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 259/ 678] blk.9.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 260/ 678] blk.9.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 261/ 678] blk.9.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 262/ 678] blk.9.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 263/ 678] blk.9.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 264/ 678] blk.9.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 265/ 678] blk.9.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 266/ 678] blk.9.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 267/ 678] blk.9.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 268/ 678] blk.9.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 269/ 678] blk.9.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 270/ 678] blk.9.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 271/ 678] blk.9.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 272/ 678] blk.9.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 273/ 678] blk.9.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 274/ 678] blk.9.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 275/ 678] blk.9.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 276/ 678] blk.9.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 277/ 678] blk.9.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 278/ 678] blk.9.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 279/ 678] blk.9.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 280/ 678] blk.9.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 281/ 678] blk.9.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 282/ 678] blk.9.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 283/ 678] blk.9.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 284/ 678] blk.10.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 285/ 678] blk.10.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 286/ 678] blk.10.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 287/ 678] blk.10.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 288/ 678] blk.10.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 289/ 678] blk.10.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 290/ 678] blk.10.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 291/ 678] blk.10.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 292/ 678] blk.10.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 293/ 678] blk.10.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 294/ 678] blk.10.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 295/ 678] blk.10.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 296/ 678] blk.10.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 297/ 678] blk.10.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 298/ 678] blk.10.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 299/ 678] blk.10.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 300/ 678] blk.10.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 301/ 678] blk.10.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 302/ 678] blk.10.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 303/ 678] blk.10.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 304/ 678] blk.10.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 305/ 678] blk.10.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 306/ 678] blk.10.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 307/ 678] blk.10.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 308/ 678] blk.10.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 309/ 678] blk.10.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 310/ 678] blk.10.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 311/ 678] blk.10.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 312/ 678] blk.11.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 313/ 678] blk.11.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 314/ 678] blk.11.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 315/ 678] blk.11.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 316/ 678] blk.11.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 317/ 678] blk.11.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 318/ 678] blk.11.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 319/ 678] blk.11.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 320/ 678] blk.11.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 321/ 678] blk.11.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 322/ 678] blk.11.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 323/ 678] blk.11.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 324/ 678] blk.11.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 325/ 678] blk.11.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 326/ 678] blk.11.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 327/ 678] blk.11.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 328/ 678] blk.11.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 329/ 678] blk.11.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 330/ 678] blk.11.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 331/ 678] blk.11.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 332/ 678] blk.11.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 333/ 678] blk.11.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 334/ 678] blk.11.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 335/ 678] blk.11.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 336/ 678] blk.11.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 337/ 678] blk.11.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 338/ 678] blk.11.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 339/ 678] blk.11.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 340/ 678] blk.12.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 341/ 678] blk.12.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 342/ 678] blk.12.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 343/ 678] blk.12.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 344/ 678] blk.12.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 345/ 678] blk.12.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 346/ 678] blk.12.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 347/ 678] blk.12.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 348/ 678] blk.12.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 349/ 678] blk.12.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 350/ 678] blk.12.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 351/ 678] blk.12.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 352/ 678] blk.12.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 353/ 678] blk.12.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 354/ 678] blk.12.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 355/ 678] blk.12.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 356/ 678] blk.12.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 357/ 678] blk.12.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 358/ 678] blk.12.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 359/ 678] blk.12.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 360/ 678] blk.12.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 361/ 678] blk.12.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 362/ 678] blk.12.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 363/ 678] blk.12.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 364/ 678] blk.12.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 365/ 678] blk.12.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 366/ 678] blk.12.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 367/ 678] blk.12.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 368/ 678] blk.13.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 369/ 678] blk.13.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 370/ 678] blk.13.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 371/ 678] blk.13.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 372/ 678] blk.13.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 373/ 678] blk.13.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 374/ 678] blk.13.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 375/ 678] blk.13.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 376/ 678] blk.13.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 377/ 678] blk.13.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 378/ 678] blk.13.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 379/ 678] blk.13.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 380/ 678] blk.13.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 381/ 678] blk.13.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 382/ 678] blk.13.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 383/ 678] blk.13.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 384/ 678] blk.13.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 385/ 678] blk.13.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 386/ 678] blk.13.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 387/ 678] blk.13.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 388/ 678] blk.13.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 389/ 678] blk.13.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 390/ 678] blk.13.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 391/ 678] blk.13.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 392/ 678] blk.13.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 393/ 678] blk.13.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 394/ 678] blk.13.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 395/ 678] blk.13.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 396/ 678] blk.14.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 397/ 678] blk.14.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 398/ 678] blk.14.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 399/ 678] blk.14.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 400/ 678] blk.14.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 401/ 678] blk.14.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 402/ 678] blk.14.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 403/ 678] blk.14.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 404/ 678] blk.14.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 405/ 678] blk.14.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 406/ 678] blk.14.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 407/ 678] blk.14.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 408/ 678] blk.14.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 409/ 678] blk.14.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 410/ 678] blk.14.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 411/ 678] blk.14.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 412/ 678] blk.14.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 413/ 678] blk.14.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 414/ 678] blk.14.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 415/ 678] blk.14.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 416/ 678] blk.14.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 417/ 678] blk.14.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 418/ 678] blk.14.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 419/ 678] blk.14.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 420/ 678] blk.14.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 421/ 678] blk.14.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 422/ 678] blk.14.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 423/ 678] blk.14.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 424/ 678] blk.15.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 425/ 678] blk.15.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 426/ 678] blk.15.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 427/ 678] blk.15.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 428/ 678] blk.15.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 429/ 678] blk.15.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 430/ 678] blk.15.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 431/ 678] blk.15.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 432/ 678] blk.15.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 433/ 678] blk.15.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 434/ 678] blk.15.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 435/ 678] blk.15.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 436/ 678] blk.15.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 437/ 678] blk.15.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 438/ 678] blk.15.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 439/ 678] blk.15.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 440/ 678] blk.15.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 441/ 678] blk.15.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 442/ 678] blk.15.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 443/ 678] blk.15.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 444/ 678] blk.15.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 445/ 678] blk.15.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 446/ 678] blk.15.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 447/ 678] blk.15.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 448/ 678] blk.15.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 449/ 678] blk.15.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 450/ 678] blk.15.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 451/ 678] blk.15.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 452/ 678] blk.16.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 453/ 678] blk.16.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 454/ 678] blk.16.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 455/ 678] blk.16.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 456/ 678] blk.16.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 457/ 678] blk.16.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 458/ 678] blk.16.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 459/ 678] blk.16.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 460/ 678] blk.16.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 461/ 678] blk.16.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 462/ 678] blk.16.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 463/ 678] blk.16.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 464/ 678] blk.16.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 465/ 678] blk.16.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 466/ 678] blk.16.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 467/ 678] blk.16.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 468/ 678] blk.16.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 469/ 678] blk.16.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 470/ 678] blk.16.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 471/ 678] blk.16.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 472/ 678] blk.16.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 473/ 678] blk.16.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 474/ 678] blk.16.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 475/ 678] blk.16.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 476/ 678] blk.16.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 477/ 678] blk.16.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 478/ 678] blk.16.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 479/ 678] blk.16.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 480/ 678] blk.17.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 481/ 678] blk.17.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 482/ 678] blk.17.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 483/ 678] blk.17.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 484/ 678] blk.17.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 485/ 678] blk.17.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 486/ 678] blk.17.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 487/ 678] blk.17.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 488/ 678] blk.17.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 489/ 678] blk.17.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 490/ 678] blk.17.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 491/ 678] blk.17.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 492/ 678] blk.17.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 493/ 678] blk.17.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 494/ 678] blk.17.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 495/ 678] blk.17.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 496/ 678] blk.17.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 497/ 678] blk.17.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 498/ 678] blk.17.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 499/ 678] blk.17.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 500/ 678] blk.17.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 501/ 678] blk.17.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 502/ 678] blk.17.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 503/ 678] blk.17.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 504/ 678] blk.17.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 505/ 678] blk.17.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 506/ 678] blk.17.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 507/ 678] blk.17.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 508/ 678] blk.18.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 509/ 678] blk.18.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 510/ 678] blk.18.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 511/ 678] blk.18.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 512/ 678] blk.18.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 513/ 678] blk.18.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 514/ 678] blk.18.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 515/ 678] blk.18.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 516/ 678] blk.18.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 517/ 678] blk.18.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 518/ 678] blk.18.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 519/ 678] blk.18.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 520/ 678] blk.18.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 521/ 678] blk.18.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 522/ 678] blk.18.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 523/ 678] blk.18.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 524/ 678] blk.18.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 525/ 678] blk.18.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 526/ 678] blk.18.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 527/ 678] blk.18.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 528/ 678] blk.18.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 529/ 678] blk.18.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 530/ 678] blk.18.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 531/ 678] blk.18.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 532/ 678] blk.18.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 533/ 678] blk.18.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 534/ 678] blk.18.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 535/ 678] blk.18.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 536/ 678] blk.19.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 537/ 678] blk.19.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 538/ 678] blk.19.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 539/ 678] blk.19.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 540/ 678] blk.19.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 541/ 678] blk.19.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 542/ 678] blk.19.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 543/ 678] blk.19.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 544/ 678] blk.19.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 545/ 678] blk.19.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 546/ 678] blk.19.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 547/ 678] blk.19.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 548/ 678] blk.19.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 549/ 678] blk.19.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 550/ 678] blk.19.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 551/ 678] blk.19.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 552/ 678] blk.19.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 553/ 678] blk.19.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 554/ 678] blk.19.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 555/ 678] blk.19.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 556/ 678] blk.19.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 557/ 678] blk.19.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 558/ 678] blk.19.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 559/ 678] blk.19.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 560/ 678] blk.19.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 561/ 678] blk.19.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 562/ 678] blk.19.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 563/ 678] blk.19.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 564/ 678] blk.20.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 565/ 678] blk.20.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 566/ 678] blk.20.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 567/ 678] blk.20.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 568/ 678] blk.20.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 569/ 678] blk.20.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 570/ 678] blk.20.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 571/ 678] blk.20.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 572/ 678] blk.20.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 573/ 678] blk.20.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 574/ 678] blk.20.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 575/ 678] blk.20.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 576/ 678] blk.20.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 577/ 678] blk.20.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 578/ 678] blk.20.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 579/ 678] blk.20.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 580/ 678] blk.20.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 581/ 678] blk.20.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 582/ 678] blk.20.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 583/ 678] blk.20.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 584/ 678] blk.20.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 585/ 678] blk.20.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 586/ 678] blk.20.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 587/ 678] blk.20.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 588/ 678] blk.20.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 589/ 678] blk.20.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 590/ 678] blk.20.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 591/ 678] blk.20.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 592/ 678] blk.21.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 593/ 678] blk.21.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 594/ 678] blk.21.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 595/ 678] blk.21.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 596/ 678] blk.21.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 597/ 678] blk.21.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 598/ 678] blk.21.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 599/ 678] blk.21.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 600/ 678] blk.21.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 601/ 678] blk.21.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 602/ 678] blk.21.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 603/ 678] blk.21.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 604/ 678] blk.21.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 605/ 678] blk.21.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 606/ 678] blk.21.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 607/ 678] blk.21.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 608/ 678] blk.21.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 609/ 678] blk.21.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 610/ 678] blk.21.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 611/ 678] blk.21.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 612/ 678] blk.21.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 613/ 678] blk.21.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 614/ 678] blk.21.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 615/ 678] blk.21.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 616/ 678] blk.21.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 617/ 678] blk.21.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 618/ 678] blk.21.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 619/ 678] blk.21.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 620/ 678] blk.22.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 621/ 678] blk.22.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 622/ 678] blk.22.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 623/ 678] blk.22.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 624/ 678] blk.22.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 625/ 678] blk.22.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 626/ 678] blk.22.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 627/ 678] blk.22.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 628/ 678] blk.22.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 629/ 678] blk.22.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 630/ 678] blk.22.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 631/ 678] blk.22.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 632/ 678] blk.22.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 633/ 678] blk.22.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 634/ 678] blk.22.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 635/ 678] blk.22.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 636/ 678] blk.22.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 637/ 678] blk.22.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 638/ 678] blk.22.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 639/ 678] blk.22.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 640/ 678] blk.22.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 641/ 678] blk.22.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 642/ 678] blk.22.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 643/ 678] blk.22.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 644/ 678] blk.22.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 645/ 678] blk.22.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 646/ 678] blk.22.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 647/ 678] blk.22.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. 
size = 28.00 MiB -> 4.59 MiB\n[ 648/ 678] blk.23.attn_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 649/ 678] blk.23.attn_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 650/ 678] blk.23.attn_norm_2.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 651/ 678] blk.23.attn_norm_2.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 652/ 678] blk.23.time_mix_lerp_x.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 653/ 678] blk.23.time_mix_lerp_w.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 654/ 678] blk.23.time_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 655/ 678] blk.23.time_mix_lerp_v.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 656/ 678] blk.23.time_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 657/ 678] blk.23.time_mix_lerp_g.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 658/ 678] blk.23.time_mix_w1.weight - [ 2048, 160, 1, 1], type = f32, size = 1.250 MB\n[ 659/ 678] blk.23.time_mix_w2.weight - [ 32, 2048, 5, 1], type = f32, size = 1.250 MB\n[ 660/ 678] blk.23.time_mix_decay.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 661/ 678] blk.23.time_mix_decay_w1.weight - [ 2048, 64, 1, 1], type = f32, size = 0.500 MB\n[ 662/ 678] blk.23.time_mix_decay_w2.weight - [ 64, 2048, 1, 1], type = f32, size = 0.500 MB\n[ 663/ 678] blk.23.time_mix_first.weight - [ 64, 32, 1, 1], type = f32, size = 0.008 MB\n[ 664/ 678] blk.23.time_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 665/ 678] blk.23.time_mix_key.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 666/ 678] blk.23.time_mix_value.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 667/ 678] blk.23.time_mix_output.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 668/ 678] blk.23.time_mix_gate.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 669/ 678] blk.23.time_mix_ln.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 670/ 678] blk.23.time_mix_ln.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 671/ 678] blk.23.channel_mix_lerp_k.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 672/ 678] blk.23.channel_mix_lerp_r.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 673/ 678] blk.23.channel_mix_key.weight - [ 2048, 7168, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 674/ 678] blk.23.channel_mix_receptance.weight - [ 2048, 2048, 1, 1], type = f16, converting to q2_K .. size = 8.00 MiB -> 1.31 MiB\n[ 675/ 678] blk.23.channel_mix_value.weight - [ 7168, 2048, 1, 1], type = f16, converting to q2_K .. size = 28.00 MiB -> 4.59 MiB\n[ 676/ 678] output_norm.weight - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 677/ 678] output_norm.bias - [ 2048, 1, 1, 1], type = f32, size = 0.008 MB\n[ 678/ 678] output.weight - [ 2048, 65536, 1, 1], type = f16, converting to q6_K .. 
size = 256.00 MiB -> 105.00 MiB\nllama_model_quantize_internal: model size = 3095.03 MB\nllama_model_quantize_internal: quant size = 643.53 MB\n\nmain: quantize time = 100331.91 ms\nmain: total time = 100331.91 ms\n","output_type":"stream"}]},{"cell_type":"code","source":"#!./llama-quantize","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vdAKSgnNS0-e","executionInfo":{"status":"ok","timestamp":1725256398962,"user_tz":-60,"elapsed":275,"user":{"displayName":"Lyte","userId":"00368277356076556155"}},"outputId":"f35500d3-68fd-45e3-fdc1-19335d6c3a6b","scrolled":true},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":"usage: ./llama-quantize [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n\n\n --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n\n --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n\n --pure: Disable k-quant mixtures and quantize all tensors to the same type\n\n --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n\n --include-weights tensor_name: use importance matrix for this/these tensor(s)\n\n --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n\n --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n\n --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n\n --keep-split: will generate quantized model in the same shards as input\n\n --override-kv KEY=TYPE:VALUE\n\n Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n\nNote: --include-weights and --exclude-weights cannot be used together\n\n\n\nAllowed quantization types:\n\n 2 or Q4_0 : 4.34G, +0.4685 ppl @ Llama-3-8B\n\n 3 or Q4_1 : 4.78G, +0.4511 ppl @ Llama-3-8B\n\n 8 or Q5_0 : 5.21G, +0.1316 ppl @ Llama-3-8B\n\n 9 or Q5_1 : 5.65G, +0.1062 ppl @ Llama-3-8B\n\n 19 or IQ2_XXS : 2.06 bpw quantization\n\n 20 or IQ2_XS : 2.31 bpw quantization\n\n 28 or IQ2_S : 2.5 bpw quantization\n\n 29 or IQ2_M : 2.7 bpw quantization\n\n 24 or IQ1_S : 1.56 bpw quantization\n\n 31 or IQ1_M : 1.75 bpw quantization\n\n 10 or Q2_K : 2.96G, +3.5199 ppl @ Llama-3-8B\n\n 21 or Q2_K_S : 2.96G, +3.1836 ppl @ Llama-3-8B\n\n 23 or IQ3_XXS : 3.06 bpw quantization\n\n 26 or IQ3_S : 3.44 bpw quantization\n\n 27 or IQ3_M : 3.66 bpw quantization mix\n\n 12 or Q3_K : alias for Q3_K_M\n\n 22 or IQ3_XS : 3.3 bpw quantization\n\n 11 or Q3_K_S : 3.41G, +1.6321 ppl @ Llama-3-8B\n\n 12 or Q3_K_M : 3.74G, +0.6569 ppl @ Llama-3-8B\n\n 13 or Q3_K_L : 4.03G, +0.5562 ppl @ Llama-3-8B\n\n 25 or IQ4_NL : 4.50 bpw non-linear quantization\n\n 30 or IQ4_XS : 4.25 bpw non-linear quantization\n\n 15 or Q4_K : alias for Q4_K_M\n\n 14 or Q4_K_S : 4.37G, +0.2689 ppl @ Llama-3-8B\n\n 15 or Q4_K_M : 4.58G, +0.1754 ppl @ Llama-3-8B\n\n 17 or Q5_K : alias for Q5_K_M\n\n 16 or Q5_K_S : 5.21G, +0.1049 ppl @ Llama-3-8B\n\n 17 or Q5_K_M : 5.33G, +0.0569 ppl @ Llama-3-8B\n\n 18 or Q6_K : 6.14G, +0.0217 ppl @ Llama-3-8B\n\n 7 or Q8_0 : 7.96G, +0.0026 ppl @ Llama-3-8B\n\n 33 or Q4_0_4_4 : 4.34G, +0.4685 ppl @ Llama-3-8B\n\n 34 or Q4_0_4_8 : 4.34G, +0.4685 ppl @ Llama-3-8B\n\n 35 or Q4_0_8_8 : 4.34G, +0.4685 ppl @ Llama-3-8B\n\n 1 or F16 : 14.00G, +0.0020 ppl @ Mistral-7B\n\n 32 or BF16 : 14.00G, -0.0050 ppl @ Mistral-7B\n\n 0 or F32 : 26.00G @ 7B\n\n COPY : only copy tensors, no quantizing\n"}]},{"cell_type":"code","source":"#create_repo(\"Lyte/RWKV-6-World-3B-v2.1-GGUF\", token=HF_TOKEN)\n\nupload_file(\n path_or_fileobj=\"./model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf\",\n path_in_repo=\"RWKV-6-World-1.6B-GGUF-Q2_K.gguf\",\n repo_id=\"Lyte/RWKV-6-World-1.6B-GGUF\",\n repo_type=\"model\",\n token=HF_TOKEN,\n)","metadata":{"id":"gEq9apcVEgAJ","execution":{"iopub.status.busy":"2024-09-16T16:26:43.503822Z","iopub.execute_input":"2024-09-16T16:26:43.504408Z","iopub.status.idle":"2024-09-16T16:27:05.656307Z","shell.execute_reply.started":"2024-09-16T16:26:43.504350Z","shell.execute_reply":"2024-09-16T16:27:05.654986Z"},"trusted":true},"execution_count":38,"outputs":[{"output_type":"display_data","data":{"text/plain":"RWKV-6-World-1.6B-GGUF-Q2_K.gguf: 0%| | 0.00/676M [00:00\", \"\\\\x00\", \"\\\\x01\", \"\\\\x02\", \"\\...\nllama_model_loader: - kv 17: tokenizer.ggml.token_type arr[i32,65536] = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nllama_model_loader: - kv 18: tokenizer.ggml.bos_token_id u32 = 0\nllama_model_loader: - kv 19: tokenizer.ggml.eos_token_id u32 = 0\nllama_model_loader: - kv 20: general.quantization_version u32 = 2\nllama_model_loader: - type f32: 484 tensors\nllama_model_loader: - type f16: 194 tensors\nllm_load_vocab: special tokens cache size = 1\nllm_load_vocab: token to piece cache size = 0.3561 MB\nllm_load_print_meta: format = GGUF V3 (latest)\nllm_load_print_meta: arch = rwkv6\nllm_load_print_meta: vocab type = RWKV\nllm_load_print_meta: n_vocab = 65536\nllm_load_print_meta: n_merges = 0\nllm_load_print_meta: vocab_only = 0\nllm_load_print_meta: n_ctx_train = 1048576\nllm_load_print_meta: n_embd = 2048\nllm_load_print_meta: n_layer = 24\nllm_load_print_meta: n_head = 
0\nllm_load_print_meta: n_head_kv = 0\nllm_load_print_meta: n_rot = 0\nllm_load_print_meta: n_swa = 0\nllm_load_print_meta: n_embd_head_k = 0\nllm_load_print_meta: n_embd_head_v = 0\nllm_load_print_meta: n_gqa = 0\nllm_load_print_meta: n_embd_k_gqa = 0\nllm_load_print_meta: n_embd_v_gqa = 0\nllm_load_print_meta: f_norm_eps = 1.0e-05\nllm_load_print_meta: f_norm_rms_eps = 0.0e+00\nllm_load_print_meta: f_clamp_kqv = 0.0e+00\nllm_load_print_meta: f_max_alibi_bias = 0.0e+00\nllm_load_print_meta: f_logit_scale = 0.0e+00\nllm_load_print_meta: n_ff = 7168\nllm_load_print_meta: n_expert = 0\nllm_load_print_meta: n_expert_used = 0\nllm_load_print_meta: causal attn = 1\nllm_load_print_meta: pooling type = 0\nllm_load_print_meta: rope type = -1\nllm_load_print_meta: rope scaling = linear\nllm_load_print_meta: freq_base_train = 10000.0\nllm_load_print_meta: freq_scale_train = 1\nllm_load_print_meta: n_ctx_orig_yarn = 1048576\nllm_load_print_meta: rope_finetuned = unknown\nllm_load_print_meta: ssm_d_conv = 0\nllm_load_print_meta: ssm_d_inner = 0\nllm_load_print_meta: ssm_d_state = 0\nllm_load_print_meta: ssm_dt_rank = 0\nllm_load_print_meta: ssm_dt_b_c_rms = 0\nllm_load_print_meta: model type = 1.6B\nllm_load_print_meta: model ftype = F16\nllm_load_print_meta: model params = 1.60 B\nllm_load_print_meta: model size = 3.02 GiB (16.23 BPW) \nllm_load_print_meta: general.name = Model\nllm_load_print_meta: BOS token = 0 ''\nllm_load_print_meta: EOS token = 0 ''\nllm_load_print_meta: LF token = 11 '\\n'\nllm_load_print_meta: max token length = 192\nllm_load_tensors: ggml ctx size = 0.26 MiB\nllm_load_tensors: CPU buffer size = 3095.03 MiB\n......................................................................................\nllama_new_context_with_model: n_ctx = 1024\nllama_new_context_with_model: n_batch = 1024\nllama_new_context_with_model: n_ubatch = 512\nllama_new_context_with_model: flash_attn = 0\nllama_new_context_with_model: freq_base = 10000.0\nllama_new_context_with_model: freq_scale = 1\nllama_kv_cache_init: CPU KV buffer size = 12.38 MiB\nllama_new_context_with_model: KV self size = 12.38 MiB, K (f32): 0.38 MiB, V (f32): 12.00 MiB\nllama_new_context_with_model: CPU output buffer size = 0.25 MiB\nllama_new_context_with_model: CPU compute buffer size = 136.00 MiB\nllama_new_context_with_model: graph nodes = 2726\nllama_new_context_with_model: graph splits = 1\nllama_init_from_gpt_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable)\nmain: llama threadpool init, n_threads = 4\n\nsystem_info: n_threads = 4 (n_threads_batch = 4) / 4 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \n\nsampler seed: 3259106188\nsampler params: \n\trepeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000\n\ttop_k = 50, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800\n\tmirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000\nsampler chain: logits -> logit-bias -> penalties -> top-k -> tail-free -> typical -> top-p -> min-p -> temp-ext -> softmax -> dist \ngenerate: n_ctx = 1024, n_batch = 2048, n_predict = 128, n_keep = 0\n\nAssistant: Hello, what can i help you with today?\nUser:\n\nllama_perf_sampler_print: sampling time = 0.00 ms / 16 runs ( 0.00 ms per token, 4000000.00 tokens per second)\nllama_perf_context_print: load time = 861.50 ms\nllama_perf_context_print: prompt eval time = 0.00 ms / 1 tokens ( 0.00 ms per token, inf tokens per second)\nllama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)\nllama_perf_context_print: total time = 127.06 ms / 2 tokens\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}