speculative : update default params (#11954)
* speculative : update default params

* speculative : do not discard the last drafted token
ggerganov authored Feb 19, 2025
1 parent 9626d93 commit abd4d0b
Showing 4 changed files with 9 additions and 9 deletions.
common/common.h (4 changes: 2 additions & 2 deletions)
@@ -178,10 +178,10 @@ struct common_params_speculative {
 
     int32_t n_ctx = 0; // draft context size
     int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float p_split = 0.1f; // speculative decoding split probability
-    float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+    float p_min = 0.75f; // minimum speculative decoding probability (greedy)
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
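The two changed defaults loosen the drafting constraints: with n_min at 0 a draft of any length can be used, and with p_min at 0.75f the drafter keeps speculating through moderately confident tokens instead of stopping at the previous 0.9f bar. Below is a minimal sketch, assuming the "common.h" and "speculative.h" headers touched by this commit are on the include path, of how these defaults would flow into the drafting parameters; make_draft_params() is a hypothetical helper, not part of llama.cpp.

    // Minimal sketch, assuming this commit's headers are available.
    // make_draft_params() is hypothetical; it only shows how the updated
    // defaults feed the drafting parameters used by common_speculative_gen_draft().
    #include "common.h"
    #include "speculative.h"

    common_speculative_params make_draft_params(const common_params_speculative & cfg) {
        common_speculative_params p;
        p.n_draft = cfg.n_max; // still 16 by default
        p.p_min   = cfg.p_min; // 0.75f after this commit (was 0.9f)
        return p;
    }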
common/speculative.cpp (10 changes: 5 additions & 5 deletions)
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
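Moving the p_min check after common_sampler_accept() and result.push_back() is what the second commit message line refers to: a token whose probability falls below p_min still ends the drafting loop, but it is now kept as the final drafted token instead of being thrown away. The following is an illustrative, self-contained sketch of the resulting control flow; sample_draft_token() and last_token_prob() are hypothetical stand-ins for the sampler calls, not the actual API.

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Hypothetical stand-ins for the sampler state used by common_speculative_gen_draft().
    static int   sample_draft_token() { return std::rand() % 32000; }
    static float last_token_prob()    { return (float) std::rand() / (float) RAND_MAX; }

    // Sketch of the loop shape after this commit.
    std::vector<int> gen_draft_sketch(int n_draft_max, float p_min) {
        std::vector<int> draft;
        for (int i = 0; i < n_draft_max; ++i) {
            const int   id = sample_draft_token();
            const float p  = last_token_prob();

            draft.push_back(id);                    // the token is recorded first ...

            if ((int) draft.size() >= n_draft_max) {
                break;
            }
            if (p < p_min) {
                break;                              // ... and only then can the confidence
            }                                       // check stop the loop, so the last
                                                    // drafted token is no longer discarded
            // otherwise the token would be fed back to the draft model and drafting continues
        }
        return draft;
    }

    int main() {
        const std::vector<int> draft = gen_draft_sketch(16, 0.75f);
        std::printf("drafted %zu tokens\n", draft.size());
        return 0;
    }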
common/speculative.h (2 changes: 1 addition & 1 deletion)
@@ -9,7 +9,7 @@ struct common_speculative_params {
     int n_draft = 16; // max drafted tokens
     int n_reuse = 256;
 
-    float p_min = 0.9f; // min probability required to accept a token in the draft
+    float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
 struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
examples/server/server.cpp (2 changes: 1 addition & 1 deletion)
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided
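On the server side the per-request n_min is still clamped into [0, n_max], but the floor drops from 2 to 0, so a request can effectively disable the minimum-draft-length requirement. A small standalone example of the clamping arithmetic with concrete values (not server code, just the same std::min/std::max sequence):

    #include <algorithm>
    #include <cstdio>

    int main() {
        // Example request values: n_max = 4, n_min = 10.
        int n_max = 4;
        int n_min = 10;

        n_min = std::min(n_max, n_min); // 4 (cannot exceed n_max)
        n_min = std::max(n_min, 0);     // 4 (floor is now 0; it used to be 2)
        n_max = std::max(n_max, 0);     // 4

        std::printf("n_min = %d, n_max = %d\n", n_min, n_max);
        return 0;
    }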
