* change the KV cache to FP16 to take advantage of tensor cores
* added a note/comment to indicate that the KV cache can be FP16
const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem;
+ // k and v here can also be GGML_TYPE_F16 to save memory and speed up the computation,
+ // if the backend supports it
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
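
For reference, a minimal sketch of what switching the cache type could look like; the `alloc_kv_cache` helper, the `use_f16` flag and the `kv_type` variable below are illustrative and not part of this change, while `n_layer`, `n_ctx` and `n_embd` come from the model hparams as above:

#include <stdbool.h>
#include "ggml.h"

// Hypothetical helper (not part of this PR): allocate the KV cache,
// optionally in FP16 when the backend supports it.
static void alloc_kv_cache(struct ggml_context * ctx,
                           struct ggml_tensor ** memory_k,
                           struct ggml_tensor ** memory_v,
                           int n_layer, int n_ctx, int n_embd, bool use_f16) {
    const int n_mem      = n_layer*n_ctx;
    const int n_elements = n_embd*n_mem;

    // FP16 halves the cache size and lets FP16 (tensor-core) kernels read it directly
    const enum ggml_type kv_type = use_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;

    *memory_k = ggml_new_tensor_1d(ctx, kv_type, n_elements);
    *memory_v = ggml_new_tensor_1d(ctx, kv_type, n_elements);
}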