params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+ add_opt(common_arg(
+ {"--context-shift"},
+ string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+ [](common_params & params) {
+ params.ctx_shift = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
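With the default flipped to off, context shift now has to be requested explicitly. A minimal way to exercise the new opt-in, assuming a locally built llama-server binary and a local GGUF model (only the flag and env-var names come from this change):
# minimal sketch, assuming ./llama-server and model.gguf exist locally
import os
import subprocess
# opt back in via the new flag; setting LLAMA_ARG_CONTEXT_SHIFT=1 in the
# environment is the equivalent alternative
subprocess.Popen(["./llama-server", "-m", "model.gguf", "--context-shift"],
                 env=os.environ.copy())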
add_opt(common_arg(
{"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
- bool ctx_shift = true; // context shift on inifinite text generation
+ bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
"temperature": 1.0,
"cache_prompt": False,
})
- assert res.status_code == 200
+ assert res.status_code == 400
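# Hedged sketch, not part of this change: presumably the assertion above now
# expects 400 because, with context shift disabled by default, a prompt that
# exceeds the context window is rejected up front instead of being shifted.
# The prompt text and multiplier below are illustrative only.
def test_long_prompt_rejected_without_ctx_shift():
    global server
    server.start()
    res = server.make_request("POST", "/completion", data={
        "prompt": "I believe the meaning of life is " * 256,
        "n_predict": 64,
    })
    assert res.status_code == 400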
def test_completion_with_tokens_input():
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
""".strip()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
# the prompt is truncated to keep the last 109 tokens
# 64 tokens are generated thanks to shifting the context when it gets full
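# (hedged note, not from this diff: when the cache fills, the server keeps the
#  first n_keep tokens and discards roughly half of the remainder, which is what
#  lets generation continue past the context size here)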
global server
+ server.enable_ctx_shift = True
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": 64,
])
def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
global server
- server.disable_ctx_shift = True
server.n_predict = -1
server.start()
res = server.make_request("POST", "/completion", data={
def test_ctx_shift_disabled_long_prompt():
global server
- server.disable_ctx_shift = True
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": 64,
def test_ctx_shift_disabled_stream():
global server
- server.disable_ctx_shift = True
server.start()
res = server.make_stream_request("POST", "/v1/completions", data={
"n_predict": 256,
EPSILON = 1e-3
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.bert_bge_small()
server = ServerPreset.tinyllama_infill()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama_infill()
LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf"
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.stories15m_moe()
server = ServerPreset.jina_reranker_tiny()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.jina_reranker_tiny()
TEST_API_KEY = "sk-this-is-the-secret-key"
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server.draft_max = 8
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def fixture_create_server():
return create_server()
def test_with_ctx_shift():
global server
server.n_ctx = 64
+ server.enable_ctx_shift = True
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "Hello " * 56,
server = ServerPreset.tinyllama2()
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
server.model_alias = "tinyllama-2-tool-call"
server.server_port = 8081
server.n_slots = 1
+ server.n_ctx = 8192
+ server.n_batch = 2048
class CompletionMode(Enum):
NORMAL = "normal"
draft: int | None = None
api_key: str | None = None
lora_files: List[str] | None = None
- disable_ctx_shift: int | None = False
+ enable_ctx_shift: bool | None = False
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
if self.lora_files:
for lora_file in self.lora_files:
server_args.extend(["--lora", lora_file])
- if self.disable_ctx_shift:
- server_args.extend(["--no-context-shift"])
+ if self.enable_ctx_shift:
+ server_args.append("--context-shift")
if self.api_key:
server_args.extend(["--api-key", self.api_key])
if self.draft_max:
params.model = params.vocoder.model;
params.embedding = true;
- params.ctx_shift = false; // silence warning
params.n_ubatch = params.n_batch;
common_init_result llama_init_cts = common_init_from_params(params);