From f39de4654897c5faf3e34c8917b0e79e62e10f37 Mon Sep 17 00:00:00 2001 From: JADDYK <12012211@mail.sustech.edu.cn> Date: Fri, 7 Feb 2025 22:15:18 +0800 Subject: [PATCH 01/10] support sglang --- src/lighteval/__main__.py | 3 + src/lighteval/main_sglang.py | 163 +++++++ src/lighteval/main_vllm.py | 3 + src/lighteval/models/model_input.py | 5 + src/lighteval/models/model_loader.py | 15 +- src/lighteval/models/sglang/sglang_model.py | 456 ++++++++++++++++++++ src/lighteval/models/vllm/vllm_model.py | 5 +- src/lighteval/pipeline.py | 8 +- src/lighteval/tasks/registry.py | 1 + src/lighteval/utils/imports.py | 6 +- 10 files changed, 660 insertions(+), 5 deletions(-) create mode 100644 src/lighteval/main_sglang.py create mode 100644 src/lighteval/models/sglang/sglang_model.py diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index e4053813e..77312f57c 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -31,6 +31,7 @@ import lighteval.main_nanotron import lighteval.main_tasks import lighteval.main_vllm +import lighteval.main_sglang app = typer.Typer() @@ -63,7 +64,9 @@ app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_nanotron.nanotron) +# Jayon02: add vllm cmd app.command(rich_help_panel="Evaluation Backends")(lighteval.main_vllm.vllm) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang) app.add_typer( lighteval.main_endpoint.app, name="endpoint", diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py new file mode 100644 index 000000000..0c662a9dd --- /dev/null +++ b/src/lighteval/main_sglang.py @@ -0,0 +1,163 @@ +# TODO: change to what? +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +import os +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +TOKEN = os.getenv("HF_TOKEN") +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANEL_NAME_1 = "Common Parameters" +HELP_PANEL_NAME_2 = "Logging Parameters" +HELP_PANEL_NAME_3 = "Debug Parameters" +HELP_PANEL_NAME_4 = "Modeling Parameters" + +# TODO: change +def sglang( + # === general === + model_args: Annotated[ + str, + Argument( + help="Model arguments in the form key1=value1,key2=value2,... 
or path to yaml config file (see examples/model_configs/transformers_model.yaml)" + ), + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) + ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) + ] = None, + job_id: Annotated[ + int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) + ] = 0, +): + """ + Evaluate models using vllm as backend. 
+ """ + import yaml + + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_input import GenerationParameters + from lighteval.models.sglang.sglang_model import SGLANGModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + TOKEN = os.getenv("HF_TOKEN") + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + ## Jayon02: vllm pipeline parameter + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.SGLANG, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=-1, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + load_responses_from_details_date_id=load_responses_from_details_date_id, + ) + + ## Jayon02: support two ways to load model + if model_args.endswith(".yaml"): + with open(model_args, "r") as f: + config = yaml.safe_load(f)["model"] + generation_parameters = GenerationParameters.from_dict(config) + model_config = SGLANGModelConfig(config, generation_parameters=generation_parameters) + + else: + ## cmd arg + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + + model_config = SGLANGModelConfig(**model_args_dict) + + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index d063c3fa8..90132dcba 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -116,6 +116,7 @@ def vllm( hub_results_org=results_org, ) + ## Jayon02: vllm pipeline parameter pipeline_params = PipelineParameters( launcher_type=ParallelismManager.VLLM, env_config=env_config, @@ -130,6 +131,7 @@ def vllm( load_responses_from_details_date_id=load_responses_from_details_date_id, ) + ## Jayon02: support two ways to load model if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] @@ -137,6 +139,7 @@ def vllm( model_config = VLLMModelConfig(config, generation_parameters=generation_parameters) else: + ## cmd arg model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_config = VLLMModelConfig(**model_args_dict) diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 04e35be17..687a561a3 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -119,3 +119,8 @@ def to_tgi_ie_dict(self) -> dict: "truncate": self.truncate_prompt, } return {k: v for k, v in args.items() if v is not None} + + # TODO first: sampling parameter + def to_sglang_dict(self) -> dict: + return {k: v for k, v in asdict(self).items() if v is not None} + diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index 68835fda7..3bf76af7d 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ 
-32,6 +32,7 @@ from lighteval.models.endpoints.openai_model import OpenAIClient, OpenAIModelConfig from lighteval.models.endpoints.tgi_model import ModelClient, TGIModelConfig from lighteval.models.litellm_model import LiteLLMClient, LiteLLMModelConfig +from lighteval.models.sglang.sglang_model import SGLANGModelConfig, SGLANGModel from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig @@ -43,7 +44,7 @@ is_litellm_available, is_openai_available, is_tgi_available, - is_vllm_available, + is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, ) from lighteval.utils.utils import EnvConfig @@ -62,6 +63,7 @@ def load_model( # noqa: C901 VLLMModelConfig, OpenAIModelConfig, LiteLLMModelConfig, + SGLANGModelConfig, ], env_config: EnvConfig, ) -> Union[TransformersModel, AdapterModel, DeltaModel, ModelClient, DummyModel]: @@ -96,6 +98,11 @@ def load_model( # noqa: C901 if isinstance(config, VLLMModelConfig): return load_model_with_accelerate_or_default(config=config, env_config=env_config) + if isinstance(config, SGLANGModelConfig): + # TODO: double check + # return load_model_with_accelerate_or_default(config=config, env_config=env_config) + return load_sglang_model(config=config, env_config=env_config) + if isinstance(config, OpenAIModelConfig): return load_openai_model(config=config, env_config=env_config) @@ -159,3 +166,9 @@ def load_model_with_accelerate_or_default( def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig): return DummyModel(config=config, env_config=env_config) + +def load_sglang_model(config: SGLANGModelConfig, env_config: EnvConfig): + if not is_sglang_available(): + raise ImportError(NO_SGLANG_ERROR_MSG) + + return SGLANGModel(config=config, env_config=env_config) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py new file mode 100644 index 000000000..6b81d2a8f --- /dev/null +++ b/src/lighteval/models/sglang/sglang_model.py @@ -0,0 +1,456 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
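# NOTE: this module follows the structure of the existing vLLM backend
# (src/lighteval/models/vllm/vllm_model.py), swapping vLLM's `LLM` engine for an
# offline `sglang.Engine` and vLLM's `SamplingParams` for SGLang sampling
# parameters, while reusing the same request/response types
# (GreedyUntilRequest -> GenerativeResponse, LoglikelihoodRequest -> LoglikelihoodResponse).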
+ +import gc +import itertools +import logging +import os +from dataclasses import dataclass +from typing import Optional + +import torch +from tqdm import tqdm + +from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset +from lighteval.models.abstract_model import LightevalModel, ModelInfo +from lighteval.models.model_input import GenerationParameters +from lighteval.models.model_output import ( + GenerativeResponse, + LoglikelihoodResponse, +) +from lighteval.models.utils import _get_dtype, _simplify_name +from lighteval.tasks.requests import ( + GreedyUntilRequest, + LoglikelihoodRequest, +) +from lighteval.utils.imports import is_sglang_available +from lighteval.utils.utils import EnvConfig, as_list + + +logger = logging.getLogger(__name__) + +from more_itertools import distribute +from sglang import Engine +from sglang.srt.hf_transformers_utils import get_tokenizer +from sglang.lang.ir import SglSamplingParams +from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment + +# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel + +logging.getLogger("sglang").propagate = True +logging.getLogger("sglang").handlers.clear() + +## Jayon02: sglang with what dependency, ray? flashinfer? +# if is_sglang_available(): +# from more_itertools import distribute +# from vllm import LLM, SamplingParams +# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel +# from vllm.transformers_utils.tokenizer import get_tokenizer +# +# logging.getLogger("sglang").propagate = True +# logging.getLogger("sglang").handlers.clear() +# else: +# LLM = None +# SamplingParams = None +# get_tokenizer = None +# distribute = None + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +STARTING_BATCH_SIZE = 512 + +## change to all sglang config +@dataclass +class SGLANGModelConfig: + pretrained: str # + trust_remote_code: bool = True # + dtype: str = "auto" # + tensor_parallel_size: int = 1 # how many GPUs to use for tensor parallelism + device: str = "cuda" + disable_radix_cache: bool = True + seed: int = 42 # + disable_cuda_graph: bool = True + disable_cuda_graph_padding: bool = True + max_model_length: int | None = None # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough + return_token_ids: bool = True + + gpu_memory_utilisation: float = 0.9 # lower this if you are running out of memory + revision: str = "main" # revision of the model + pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism + data_parallel_size: int = 1 # how many GPUs to use for data parallelism + swap_space: int = 4 # CPU swap space size (GiB) per GPU. + use_chat_template: bool = False + add_special_tokens: bool = True + multichoice_continuations_start_space: bool = ( + True # whether to add a space at the start of each continuation in multichoice generation + ) + pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. 
+ generation_parameters: GenerationParameters = None # sampling parameters to use for generation + + subfolder: Optional[str] = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() + + +class SGLANGModel(LightevalModel): + def __init__( + self, + config: SGLANGModelConfig, + env_config: EnvConfig, + ): + """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" + self._config = config + self.use_chat_template = config.use_chat_template + self.data_parallel_size = int(config.data_parallel_size) + self.tensor_parallel_size = int(config.tensor_parallel_size) + + self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False + self._tokenizer = self._create_auto_tokenizer(config, env_config) + + self._max_length = int(config.max_model_length) if config.max_model_length is not None else None + + # If model_parallel is not set we compare the number of processes with the number of GPUs + self.model = self._create_auto_model(config, env_config) + + # self._device = config.accelerator.device if config.accelerator is not None else "cpu" + self.multichoice_continuations_start_space = config.multichoice_continuations_start_space + + self.model_name = _simplify_name(config.pretrained) + self.model_sha = "" # config.get_model_sha() + self.precision = _get_dtype(config.dtype, config=self._config) + + self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) + self.sampling_params = SglSamplingParams(**config.generation_parameters.to_sglang_dict()) + self.pairwise_tokenization = config.pairwise_tokenization + + @property + def tokenizer(self): + return self._tokenizer + + def cleanup(self): + destroy_model_parallel() + if self.model is not None: + del self.model.llm_engine.model_executor.driver_worker + self.model = None + gc.collect() + # TODO: check sglang dependency: ray flashinfer ray? + # ray.shutdown() + destroy_distributed_environment() + torch.cuda.empty_cache() + + @property + def add_special_tokens(self): + return self._add_special_tokens + + @property + def max_length(self) -> int: + return self._max_length + + def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) -> Optional[Engine]: + """ + Creates an instance of the pretrained HF model. + + Args: + pretrained (str): The name or path of the pretrained model. + revision (str): The revision of the model. + subfolder (Optional[str], optional): The subfolder within the model. Defaults to None. + max_memory (Optional[dict], optional): The maximum memory to allocate for the model per GPU. Defaults to None. + device_map (Optional[dict], optional): The device mapping for the model. Defaults to None. + torch_dtype (Optional[Union[str, torch.dtype]], optional): The torch data type for the model. Defaults to None. + quantization_config (Optional[Union[BitsAndBytesConfig, GPTQConfig]], optional): The quantization configuration for the model. Defaults to None. + trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + cache_dir (str, optional): The cache directory for the model. Defaults to "/scratch". + + Returns: + transformers.PreTrainedModel: The created auto model instance. 
+ """ + # self.model_args = { + # "model": config.pretrained, + # "gpu_memory_utilization": float(config.gpu_memory_utilisation), + # "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), + # "dtype": config.dtype, + # "trust_remote_code": config.trust_remote_code, + # "tensor_parallel_size": int(config.tensor_parallel_size), + # "pipeline_parallel_size": int(config.pipeline_parallel_size), + # "max_model_len": self._max_length, + # "swap_space": 4, + # "seed": 1234, + # } + + # TODO: double check + self.model_args = { + "model_path": config.pretrained, + "trust_remote_code": config.trust_remote_code, + "dtype": config.dtype, + "tp_size": int(config.tensor_parallel_size), + "device": "cuda", + "disable_radix_cache": config.disable_radix_cache, + "random_seed": config.seed, + "disable_cuda_graph": config.disable_cuda_graph, + "disable_cuda_graph_padding": config.disable_cuda_graph_padding, + "context_length": self._max_length, + "log_level": "info", + "return_token_ids": True, + + "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), + } + + # TODO: double check + # if int(config.data_parallel_size) > 1: + # self.model_args["distributed_executor_backend"] = "ray" + # self._batch_size = "auto" + # return None + + model = Engine(**self.model_args) + + # TODO: double check + # If the max_length can't get extracted from the config, it will be inferred from the model + # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model + # config and tk config, like mistralai/Mistral-7B-v0.1 + # if self._max_length is None: + # self._max_length = model.llm_engine.model_config.max_seq_len_to_capture + + return model + + def _create_auto_tokenizer(self, config: SGLANGModelConfig, env_config: EnvConfig): + tokenizer = get_tokenizer( + config.pretrained, + tokenizer_mode="auto", + trust_remote_code=config.trust_remote_code, + tokenizer_revision=config.revision, + ) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + def greedy_until( + self, + requests: list[GreedyUntilRequest], + override_bs: Optional[int] = None, + ) -> list[GenerativeResponse]: + """ + Generates responses using a greedy decoding strategy until certain ending conditions are met. + + Args: + requests (list[Request]): list of requests containing the context and ending conditions. + override_bs (int, optional): Override the batch size for generation. Defaults to None. + + Returns: + list[GenerateReturn]: list of generated responses. + """ + for request in requests: + request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] + request.tokenized_context = self.tok_encode(request.context) + + dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) + results = [] + + for _ in tqdm( + dataset.splits_start_end_iterator(), + total=dataset.num_dataset_splits, + desc="Splits", + position=0, + disable=False, # self.disable_tqdm, + ): + # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: + stop_tokens = [] + else: + # NOTE: we are assuming all items in a batch behave similarly (same + # stop_tokens and max_tokens genrated) which is not necessarily + # the case! 
Because of that we only use batch size of 1 + stop_tokens = dataset[0].stop_sequence + + max_new_tokens = dataset[0].generation_size # could be none + returns_logits = dataset[0].use_logits + num_samples = dataset[0].num_samples + + context = [c.context for c in dataset] + tokenized = self.tokenizer(context, add_special_tokens=self.add_special_tokens) + + # The main question for this step is the following: + # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk + # of losing some meaning, or have some generations that are exceedingly short? + # The choice we go for here is to avoid truncating the prompt if we can, since it + # should have been managed by the prompt creator/few shot manager if requested by the user. + inputs = tokenized["input_ids"] + context_size = len(inputs[0]) + + # left truncate the inputs to the maximum length + if max_new_tokens is not None: + if context_size + max_new_tokens > self.max_length: + logger.warning( + f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + ) + context_size = self.max_length - max_new_tokens + inputs = [input[-context_size:] for input in inputs] + else: + if context_size > self.max_length: + logger.warning( + f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." + ) + context_size = self.max_length + inputs = [input[-context_size:] for input in inputs] + + sglang_outputs = self._generate( + inputs=inputs, + max_new_tokens=max_new_tokens, + stop_tokens=stop_tokens, + returns_logits=returns_logits, + num_samples=num_samples, + ) + + for sglang_output in sglang_outputs: + output_token_ids = [outputs.token_ids for outputs in sglang_output.outputs] + logprobs = [output.logprobs for output in sglang_output.outputs] or [] + logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] + result = [output.text for output in sglang_output.outputs] + input_token_ids = sglang_output.prompt_token_ids + + cur_response = GenerativeResponse( + result=result, + logits=logprobs, + generated_tokens=list(output_token_ids), + input_tokens=input_token_ids, + ) + results.append(cur_response) + + return dataset.get_original_order(results) + + def _generate( + self, + inputs: list[list[int]], + max_new_tokens: Optional[int] = None, + stop_tokens: Optional[list[str]] = None, + returns_logits: Optional[bool] = False, + num_samples: int = 1, + generate: bool = True, + ) -> list[GenerativeResponse]: + """Contains the actual logic of the generation.""" + sampling_params = self.sampling_params.clone() or SglSamplingParams() + if generate: + sampling_params.n = num_samples + sampling_params.max_tokens = max_new_tokens + sampling_params.stop = stop_tokens + sampling_params.logprobs = 1 if returns_logits else 0 + + else: + sampling_params.temperature = 0 + sampling_params.prompt_logprobs = 1 + sampling_params.max_tokens = 1 + sampling_params.detokenize = False + + ## Jayon02: how do sglang handle this + # if self.data_parallel_size > 1: + # # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote + # # also seems to only work with decorator and not with ray.remote() fn + # # see https://github.com/vllm-project/vllm/issues/973 + # # note: this has changed on 0.3.3, and it only works now if num_gpus are set. 
+ # # but then tensor_parallel breaks + # # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set, + # # as VLLM complains about no GPUs available. + # @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None) + # def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): + # llm = LLM(**model_args) + # return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params) + # + # # dispatch requests to all self.data_parallel_size workers, in interleaved fashion + # # interleaved important to balance context lengths across workers + # requests = [list(x) for x in distribute(self.data_parallel_size, inputs)] + # inputs = ((self.model_args, sampling_params, req) for req in requests) + # object_refs = [run_inference_one_model.remote(*x) for x in inputs] + # results = ray.get(object_refs) + # # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required. + # ray.shutdown() + # # flatten results + # outputs = [ + # x + # for x in itertools.chain.from_iterable(itertools.zip_longest(*[list(x) for x in results])) + # if x is not None + # ] + # else: + # outputs = self.model.generate( + # prompt_token_ids=inputs, + # sampling_params=sampling_params, + # use_tqdm=True, + # ) + + outputs = self.model.generate( + prompt_token_ids=inputs, + sampling_params=sampling_params, + use_tqdm=True, + ) + + return outputs + + def loglikelihood( + self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None + ) -> list[LoglikelihoodResponse]: + for request in requests: + if request.context == "": + request.tokenized_context = [self.tokenizer.eos_token_id] + request.tokenized_continuation = self.tok_encode(request.choice) + else: + # The following line is mandatory for compatibility with the harness + request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( + request.context, request.choice, pairwise=self.pairwise_tokenization + ) + return self._loglikelihood_tokens(requests, override_bs=override_bs) + + def _loglikelihood_tokens( + self, + requests: list[LoglikelihoodRequest], + override_bs: int = -1, + return_bool_score: bool = True, + rolling: bool = False, + ) -> list[LoglikelihoodResponse]: + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=1) + res = [] + + for _ in tqdm(dataset.splits_start_end_iterator()): + # the last token is an eos token, so we don't need to add it + inputs = [dataset[i].tokenized_context + dataset[i].tokenized_continuation for i in range(len(dataset))] + # Left truncate the inputs to the maximum length + inputs = [input[-self.max_length :] for input in inputs] + outputs = self._generate(inputs, generate=False) + + for output, input in zip(outputs, dataset): + continuation_logprobs = [] + for token, logprobs in zip(input.tokenized_continuation[::-1], output.prompt_logprobs[::-1]): + continuation_logprobs.append(logprobs[token]) + bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs) + continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs] + answer = LoglikelihoodResponse( + input_tokens=input.tokenized_context + input.tokenized_continuation, + generated_tokens=input.tokenized_continuation, + result=(sum(continuation_logprobs), bool_score if return_bool_score else None), + ) + res.append(answer) + + return dataset.get_original_order(res) + + def loglikelihood_rolling(): + pass + + def loglikelihood_single_token(): + pass diff --git a/src/lighteval/models/vllm/vllm_model.py 
b/src/lighteval/models/vllm/vllm_model.py index 0bb13a6f1..9d666b4a5 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) - +## Jayon02: sglang with what dependency, ray? flashinfer? if is_vllm_available(): import ray from more_itertools import distribute @@ -72,7 +72,7 @@ STARTING_BATCH_SIZE = 512 - +## change to all sglang config @dataclass class VLLMModelConfig: pretrained: str @@ -326,6 +326,7 @@ def _generate( sampling_params.max_tokens = 1 sampling_params.detokenize = False + ## Jayon02: how do sglang handle this if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote # also seems to only work with decorator and not with ray.remote() fn diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 0e6282ef5..148c1849e 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -57,7 +57,7 @@ is_nanotron_available, is_openai_available, is_tgi_available, - is_vllm_available, + is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, ) from lighteval.utils.parallelism import test_all_gather from lighteval.utils.utils import EnvConfig, make_results_table @@ -86,6 +86,7 @@ class ParallelismManager(Enum): OPENAI = auto() VLLM = auto() NONE = auto() + SGLANG = auto() @dataclass @@ -113,6 +114,9 @@ def __post_init__(self): # noqa C901 elif self.launcher_type == ParallelismManager.VLLM: if not is_vllm_available(): raise ImportError(NO_VLLM_ERROR_MSG) + elif self.launcher_type == ParallelismManager.SGLANG: + if not is_sglang_available(): + raise ImportError(NO_SGLANG_ERROR_MSG) elif self.launcher_type == ParallelismManager.TGI: if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) @@ -189,7 +193,9 @@ def _init_model(self, model_config, model): env_config=self.pipeline_parameters.env_config, ) else: + ## Jayon02: load model into vllm return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) + if isinstance(model, TransformersModel): return model else: diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 174a98d33..71a8c2bb1 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -299,6 +299,7 @@ def taskinfo_selector(tasks: str, task_registry: Registry) -> tuple[list[str], d expanded_tasks = task_registry.task_groups_dict.get(maybe_task_group, [maybe_task_group]) expanded_tasks_list.extend(expanded_tasks) + ## task expand and few shot number record, not load task for task in expanded_tasks_list: try: suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|")) diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py index 9b92adcee..14e3b94da 100644 --- a/src/lighteval/utils/imports.py +++ b/src/lighteval/utils/imports.py @@ -87,9 +87,13 @@ def is_litellm_available() -> bool: def is_vllm_available() -> bool: return importlib.util.find_spec("vllm") is not None and importlib.util.find_spec("ray") is not None - NO_VLLM_ERROR_MSG = "You are trying to use an VLLM model, for which you need `vllm` and `ray`, which are not available in your environment. Please install them using pip, `pip install vllm ray`." 
+# TODO: need review +def is_sglang_available() -> bool: + return importlib.util.find_spec("sglang") is not None and importlib.util.find_spec("flashinfer") is not None + +NO_SGLANG_ERROR_MSG = "You are trying to use an sglang model, for which you need `sglang` and `flashinfer`, which are not available in your environment. Please install them using pip, `pip install vllm ray`." def can_load_extended_tasks() -> bool: imports = [] From 22dc29b7effbf9876848ecaad96a3a1c983cc0ac Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 10:20:29 +0800 Subject: [PATCH 02/10] output bugs --- sglang_inputs_token.txt | 16 ++++ sglang_output.txt | 11 +++ sglang_sampling_para.txt | 1 + src/lighteval/models/model_input.py | 16 +++- src/lighteval/models/sglang/sglang_model.py | 85 +++++++++++++-------- src/lighteval/models/vllm/vllm_model.py | 1 - vllm_inputs_token.txt | 16 ++++ vllm_output.txt | 1 + vllm_sampling_para.txt | 1 + 9 files changed, 112 insertions(+), 36 deletions(-) create mode 100644 sglang_inputs_token.txt create mode 100644 sglang_output.txt create mode 100644 sglang_sampling_para.txt create mode 100644 vllm_inputs_token.txt create mode 100644 vllm_output.txt create mode 100644 vllm_sampling_para.txt diff --git a/sglang_inputs_token.txt b/sglang_inputs_token.txt new file mode 100644 index 000000000..76bbf4434 --- /dev/null +++ b/sglang_inputs_token.txt @@ -0,0 +1,16 @@ +1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 +1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 +1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 
+1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 +1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 +1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 +1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 +1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 +1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 +1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/sglang_output.txt b/sglang_output.txt new file mode 100644 index 000000000..e546c060d --- /dev/null +++ b/sglang_output.txt @@ -0,0 +1,11 @@ +│ │ │ { │ │ +│ │ │ │ 'text': ' Answer according to: For the given dictionary d with keys │ │ +│ │ being integers, trave'+208, │ │ +│ │ │ │ 'meta_info': { │ │ +│ │ │ │ │ 'id': '04224678f3a24f7abfafef26cd6d101f', │ │ +│ │ │ │ │ 'finish_reason': {'type': 'stop', 'matched': '\n'}, │ │ +│ │ │ │ │ 'prompt_tokens': 77, │ │ +│ │ │ │ │ 'completion_tokens': 58, │ │ +│ │ │ │ │ 'cached_tokens': 0 │ │ +│ │ │ │ } │ │ +│ │ │ }, \ No newline at end of file diff --git a/sglang_sampling_para.txt b/sglang_sampling_para.txt new file mode 100644 index 000000000..e4813cdca --- /dev/null +++ b/sglang_sampling_para.txt @@ -0,0 +1 @@ +{'top_p': 1.0, 'top_k': -1, 'max_new_tokens': 100, 'stop': ['\n', ''], 'temperature': 1.0, 'repetition_penalty': 1.0, 'skip_special_tokens': True, 'spaces_between_special_tokens': True} \ No newline at end of file diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 687a561a3..bedc12b8d 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -120,7 +120,17 @@ def to_tgi_ie_dict(self) -> dict: } return {k: v for k, v in args.items() if v is not None} - # TODO first: sampling parameter - def to_sglang_dict(self) -> dict: - return {k: v for k, v in asdict(self).items() if v is not None} + # # TODO first: sampling parameter + # def to_sglang_dict(self) -> dict: + # args = { 
+ # "max_new_tokens": self.max_new_tokens, + # "min_new_tokens": self.min_new_tokens, + # "stop_token_ids": self.stop_tokens, + # "temperature": self.temperature, + # "top_k": self.top_k, + # "top_p": self.top_p, + # "min_p": self.min_p, + # "repetition_penalty": self.repetition_penalty, + # } + # return {k: v for k, v in args.items() if v is not None} diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 6b81d2a8f..b73d7bf67 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -51,7 +51,7 @@ from more_itertools import distribute from sglang import Engine from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.lang.ir import SglSamplingParams +from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment # from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel @@ -141,7 +141,8 @@ def __init__( self.precision = _get_dtype(config.dtype, config=self._config) self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - self.sampling_params = SglSamplingParams(**config.generation_parameters.to_sglang_dict()) + # self.sampling_params = SamplingParams(**config.generation_parameters.to_sglang_dict()) + self.sampling_params = dict() self.pairwise_tokenization = config.pairwise_tokenization @property @@ -211,7 +212,7 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - "disable_cuda_graph_padding": config.disable_cuda_graph_padding, "context_length": self._max_length, "log_level": "info", - "return_token_ids": True, + # "return_token_ids": True, "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), } @@ -297,20 +298,20 @@ def greedy_until( context_size = len(inputs[0]) # left truncate the inputs to the maximum length - if max_new_tokens is not None: - if context_size + max_new_tokens > self.max_length: - logger.warning( - f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." - ) - context_size = self.max_length - max_new_tokens - inputs = [input[-context_size:] for input in inputs] - else: - if context_size > self.max_length: - logger.warning( - f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." - ) - context_size = self.max_length - inputs = [input[-context_size:] for input in inputs] + # if max_new_tokens is not None: + # if context_size + max_new_tokens > self.max_length: + # logger.warning( + # f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + # ) + # context_size = self.max_length - max_new_tokens + # inputs = [input[-context_size:] for input in inputs] + # else: + # if context_size > self.max_length: + # logger.warning( + # f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." 
+ # ) + # context_size = self.max_length + # inputs = [input[-context_size:] for input in inputs] sglang_outputs = self._generate( inputs=inputs, @@ -347,18 +348,31 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" - sampling_params = self.sampling_params.clone() or SglSamplingParams() - if generate: - sampling_params.n = num_samples - sampling_params.max_tokens = max_new_tokens - sampling_params.stop = stop_tokens - sampling_params.logprobs = 1 if returns_logits else 0 - - else: - sampling_params.temperature = 0 - sampling_params.prompt_logprobs = 1 - sampling_params.max_tokens = 1 - sampling_params.detokenize = False + # TODO: double check without clone + # bug: params are wrong + # sampling_params = self.sampling_params + # if generate: + # sampling_params.n = num_samples + # sampling_params.max_tokens = max_new_tokens + # sampling_params.stop = stop_tokens + # sampling_params.logprobs = 1 if returns_logits else 0 + + # else: + # sampling_params.temperature = 0 + # sampling_params.prompt_logprobs = 1 + # sampling_params.max_tokens = 1 + # sampling_params.detokenize = False + + params = dict( + top_p=1.0, + top_k=-1, + max_new_tokens=max_new_tokens, + stop=stop_tokens, + temperature=1.0, + repetition_penalty=1.0, + skip_special_tokens=True, + spaces_between_special_tokens=True + ) ## Jayon02: how do sglang handle this # if self.data_parallel_size > 1: @@ -395,12 +409,19 @@ def _generate( # use_tqdm=True, # ) + # print(params) + # exit(0) + outputs = self.model.generate( - prompt_token_ids=inputs, - sampling_params=sampling_params, - use_tqdm=True, + input_ids=inputs, + sampling_params=params, ) + # outputs = self.model.generate( + # inputs, + # params, + # ) + return outputs def loglikelihood( diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 9d666b4a5..b86850a6c 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -326,7 +326,6 @@ def _generate( sampling_params.max_tokens = 1 sampling_params.detokenize = False - ## Jayon02: how do sglang handle this if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote # also seems to only work with decorator and not with ray.remote() fn diff --git a/vllm_inputs_token.txt b/vllm_inputs_token.txt new file mode 100644 index 000000000..76bbf4434 --- /dev/null +++ b/vllm_inputs_token.txt @@ -0,0 +1,16 @@ +1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 +1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 
+1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 +1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 +1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 +1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 +1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 +1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 +1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 +1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/vllm_output.txt b/vllm_output.txt new file mode 100644 index 000000000..fab763088 --- /dev/null +++ b/vllm_output.txt @@ -0,0 +1 @@ +RequestOutput(request_id=0, prompt=None, prompt_token_ids=[1, 8789, 13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 
2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='', token_ids=(13,), cumulative_logprob=-0.047923602163791656, logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], finish_reason=stop, stop_reason=)], finished=True, metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), lora_request=None, num_cached_tokens=0) diff --git a/vllm_sampling_para.txt b/vllm_sampling_para.txt new file mode 100644 index 000000000..17aabb410 --- /dev/null +++ b/vllm_sampling_para.txt @@ -0,0 +1 @@ +SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=['\n', ''], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=100, min_tokens=0, logprobs=0, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None) \ No newline at end of file From b23063be4435d6647f45a3f7932a3a096daac187 Mon Sep 17 00:00:00 2001 From: Qiujiang Chen <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 10:26:41 +0800 Subject: [PATCH 03/10] Update vllm_output.txt --- vllm_output.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm_output.txt b/vllm_output.txt index fab763088..d4fc354cb 100644 --- a/vllm_output.txt +++ b/vllm_output.txt @@ -1 +1,12 @@ -RequestOutput(request_id=0, prompt=None, prompt_token_ids=[1, 8789, 13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='', token_ids=(13,), cumulative_logprob=-0.047923602163791656, logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], finish_reason=stop, stop_reason=)], finished=True, metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), lora_request=None, num_cached_tokens=0) +RequestOutput(request_id=0, +prompt=None, +prompt_token_ids=[1, 8789, 
13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], +encoder_prompt=None, +encoder_prompt_token_ids=None, prompt_logprobs=None, +outputs=[CompletionOutput(index=0, text='', token_ids=(13,), + cumulative_logprob=-0.047923602163791656, + logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], + finish_reason=stop, stop_reason=)], +finished=True, +metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), +lora_request=None, num_cached_tokens=0) From 85954f26758badfc11cb4136f18c513862b9e396 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 15:45:50 +0800 Subject: [PATCH 04/10] fix outputs bug --- src/lighteval/models/model_input.py | 1 - src/lighteval/models/sglang/sglang_model.py | 235 ++++---------------- src/lighteval/models/vllm/vllm_model.py | 2 +- 3 files changed, 45 insertions(+), 193 deletions(-) diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index bedc12b8d..575eb8024 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -120,7 +120,6 @@ def to_tgi_ie_dict(self) -> dict: } return {k: v for k, v in args.items() if v is not None} - # # TODO first: sampling parameter # def to_sglang_dict(self) -> dict: # args = { # "max_new_tokens": self.max_new_tokens, diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index b73d7bf67..a45efb9f5 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -82,36 +82,19 @@ @dataclass class SGLANGModelConfig: pretrained: str # - trust_remote_code: bool = True # + load_format: str = "auto" dtype: str = "auto" # - tensor_parallel_size: int = 1 # how many GPUs to use for tensor parallelism + tp_size: int = 1 # how many GPUs to use for tensor parallelism + dp_size: int = 1 # how many GPUs to use for data parallelism + context_length: int | None = None + random_seed: Optional[int] = None + trust_remote_code: bool = True # + chat_template: Optional[str] = None device: str = "cuda" - disable_radix_cache: bool = True - seed: int = 42 # - disable_cuda_graph: bool = True - disable_cuda_graph_padding: bool = True - max_model_length: int | None = None # maximum length of the model, ussually infered automatically. 
reduce this if you encouter OOM issues, 4096 is usually enough - return_token_ids: bool = True - - gpu_memory_utilisation: float = 0.9 # lower this if you are running out of memory - revision: str = "main" # revision of the model + skip_tokenizer_init: bool = False + kv_cache_dtype: str = "auto", + add_special_tokens: bool = True, pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism - data_parallel_size: int = 1 # how many GPUs to use for data parallelism - swap_space: int = 4 # CPU swap space size (GiB) per GPU. - use_chat_template: bool = False - add_special_tokens: bool = True - multichoice_continuations_start_space: bool = ( - True # whether to add a space at the start of each continuation in multichoice generation - ) - pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. - generation_parameters: GenerationParameters = None # sampling parameters to use for generation - - subfolder: Optional[str] = None - - def __post_init__(self): - if not self.generation_parameters: - self.generation_parameters = GenerationParameters() - class SGLANGModel(LightevalModel): def __init__( @@ -121,30 +104,18 @@ def __init__( ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" self._config = config - self.use_chat_template = config.use_chat_template - self.data_parallel_size = int(config.data_parallel_size) - self.tensor_parallel_size = int(config.tensor_parallel_size) - - self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False + self.use_chat_template = config.chat_template is not None + self.data_parallel_size = int(config.dp_size) + self.tensor_parallel_size = int(config.tp_size) + self._add_special_tokens = bool(config.add_special_tokens) self._tokenizer = self._create_auto_tokenizer(config, env_config) - - self._max_length = int(config.max_model_length) if config.max_model_length is not None else None - - # If model_parallel is not set we compare the number of processes with the number of GPUs + self._max_length = int(config.context_length) if config.context_length is not None else 256 self.model = self._create_auto_model(config, env_config) - - # self._device = config.accelerator.device if config.accelerator is not None else "cpu" - self.multichoice_continuations_start_space = config.multichoice_continuations_start_space - self.model_name = _simplify_name(config.pretrained) self.model_sha = "" # config.get_model_sha() self.precision = _get_dtype(config.dtype, config=self._config) - self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - # self.sampling_params = SamplingParams(**config.generation_parameters.to_sglang_dict()) - self.sampling_params = dict() - self.pairwise_tokenization = config.pairwise_tokenization - + @property def tokenizer(self): return self._tokenizer @@ -155,8 +126,6 @@ def cleanup(self): del self.model.llm_engine.model_executor.driver_worker self.model = None gc.collect() - # TODO: check sglang dependency: ray flashinfer ray? - # ray.shutdown() destroy_distributed_environment() torch.cuda.empty_cache() @@ -186,43 +155,20 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - Returns: transformers.PreTrainedModel: The created auto model instance. 
""" - # self.model_args = { - # "model": config.pretrained, - # "gpu_memory_utilization": float(config.gpu_memory_utilisation), - # "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), - # "dtype": config.dtype, - # "trust_remote_code": config.trust_remote_code, - # "tensor_parallel_size": int(config.tensor_parallel_size), - # "pipeline_parallel_size": int(config.pipeline_parallel_size), - # "max_model_len": self._max_length, - # "swap_space": 4, - # "seed": 1234, - # } - # TODO: double check self.model_args = { "model_path": config.pretrained, "trust_remote_code": config.trust_remote_code, "dtype": config.dtype, - "tp_size": int(config.tensor_parallel_size), "device": "cuda", - "disable_radix_cache": config.disable_radix_cache, - "random_seed": config.seed, - "disable_cuda_graph": config.disable_cuda_graph, - "disable_cuda_graph_padding": config.disable_cuda_graph_padding, - "context_length": self._max_length, + "random_seed": config.random_seed, + "load_format": config.load_format, + "context_length": int(self._max_length) if self._max_length else None, + "dp_size": int(config.dp_size), + "tp_size": int(config.tp_size), "log_level": "info", - # "return_token_ids": True, - - "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), } - # TODO: double check - # if int(config.data_parallel_size) > 1: - # self.model_args["distributed_executor_backend"] = "ray" - # self._batch_size = "auto" - # return None - model = Engine(**self.model_args) # TODO: double check @@ -239,7 +185,7 @@ def _create_auto_tokenizer(self, config: SGLANGModelConfig, env_config: EnvConfi config.pretrained, tokenizer_mode="auto", trust_remote_code=config.trust_remote_code, - tokenizer_revision=config.revision, + tokenizer_revision="main", ) tokenizer.pad_token = tokenizer.eos_token return tokenizer @@ -294,34 +240,36 @@ def greedy_until( # of losing some meaning, or have some generations that are exceedingly short? # The choice we go for here is to avoid truncating the prompt if we can, since it # should have been managed by the prompt creator/few shot manager if requested by the user. + inputs = tokenized["input_ids"] context_size = len(inputs[0]) # left truncate the inputs to the maximum length - # if max_new_tokens is not None: - # if context_size + max_new_tokens > self.max_length: - # logger.warning( - # f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." - # ) - # context_size = self.max_length - max_new_tokens - # inputs = [input[-context_size:] for input in inputs] - # else: - # if context_size > self.max_length: - # logger.warning( - # f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." - # ) - # context_size = self.max_length - # inputs = [input[-context_size:] for input in inputs] + if max_new_tokens is not None: + if context_size + max_new_tokens > self.max_length: + logger.warning( + f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + ) + context_size = self.max_length - max_new_tokens + inputs = [input[-context_size:] for input in inputs] + else: + if context_size > self.max_length: + logger.warning( + f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." 
+ ) + context_size = self.max_length + inputs = [input[-context_size:] for input in inputs] sglang_outputs = self._generate( inputs=inputs, max_new_tokens=max_new_tokens, stop_tokens=stop_tokens, - returns_logits=returns_logits, num_samples=num_samples, ) for sglang_output in sglang_outputs: + print(sglang_output) + exit(0) output_token_ids = [outputs.token_ids for outputs in sglang_output.outputs] logprobs = [output.logprobs for output in sglang_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] @@ -343,132 +291,37 @@ def _generate( inputs: list[list[int]], max_new_tokens: Optional[int] = None, stop_tokens: Optional[list[str]] = None, - returns_logits: Optional[bool] = False, num_samples: int = 1, generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" # TODO: double check without clone - # bug: params are wrong # sampling_params = self.sampling_params - # if generate: - # sampling_params.n = num_samples - # sampling_params.max_tokens = max_new_tokens - # sampling_params.stop = stop_tokens - # sampling_params.logprobs = 1 if returns_logits else 0 - - # else: - # sampling_params.temperature = 0 - # sampling_params.prompt_logprobs = 1 - # sampling_params.max_tokens = 1 - # sampling_params.detokenize = False - + params = dict( top_p=1.0, top_k=-1, + min_p=0, max_new_tokens=max_new_tokens, - stop=stop_tokens, + # stop=stop_tokens, temperature=1.0, repetition_penalty=1.0, skip_special_tokens=True, spaces_between_special_tokens=True ) - ## Jayon02: how do sglang handle this - # if self.data_parallel_size > 1: - # # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote - # # also seems to only work with decorator and not with ray.remote() fn - # # see https://github.com/vllm-project/vllm/issues/973 - # # note: this has changed on 0.3.3, and it only works now if num_gpus are set. - # # but then tensor_parallel breaks - # # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set, - # # as VLLM complains about no GPUs available. - # @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None) - # def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): - # llm = LLM(**model_args) - # return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params) - # - # # dispatch requests to all self.data_parallel_size workers, in interleaved fashion - # # interleaved important to balance context lengths across workers - # requests = [list(x) for x in distribute(self.data_parallel_size, inputs)] - # inputs = ((self.model_args, sampling_params, req) for req in requests) - # object_refs = [run_inference_one_model.remote(*x) for x in inputs] - # results = ray.get(object_refs) - # # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required. 
- # ray.shutdown() - # # flatten results - # outputs = [ - # x - # for x in itertools.chain.from_iterable(itertools.zip_longest(*[list(x) for x in results])) - # if x is not None - # ] - # else: - # outputs = self.model.generate( - # prompt_token_ids=inputs, - # sampling_params=sampling_params, - # use_tqdm=True, - # ) - - # print(params) - # exit(0) - outputs = self.model.generate( input_ids=inputs, sampling_params=params, + return_logprob=True, ) - # outputs = self.model.generate( - # inputs, - # params, - # ) - return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodResponse]: - for request in requests: - if request.context == "": - request.tokenized_context = [self.tokenizer.eos_token_id] - request.tokenized_continuation = self.tok_encode(request.choice) - else: - # The following line is mandatory for compatibility with the harness - request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( - request.context, request.choice, pairwise=self.pairwise_tokenization - ) - return self._loglikelihood_tokens(requests, override_bs=override_bs) - - def _loglikelihood_tokens( - self, - requests: list[LoglikelihoodRequest], - override_bs: int = -1, - return_bool_score: bool = True, - rolling: bool = False, - ) -> list[LoglikelihoodResponse]: - dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=1) - res = [] - - for _ in tqdm(dataset.splits_start_end_iterator()): - # the last token is an eos token, so we don't need to add it - inputs = [dataset[i].tokenized_context + dataset[i].tokenized_continuation for i in range(len(dataset))] - # Left truncate the inputs to the maximum length - inputs = [input[-self.max_length :] for input in inputs] - outputs = self._generate(inputs, generate=False) - - for output, input in zip(outputs, dataset): - continuation_logprobs = [] - for token, logprobs in zip(input.tokenized_continuation[::-1], output.prompt_logprobs[::-1]): - continuation_logprobs.append(logprobs[token]) - bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs) - continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs] - answer = LoglikelihoodResponse( - input_tokens=input.tokenized_context + input.tokenized_continuation, - generated_tokens=input.tokenized_continuation, - result=(sum(continuation_logprobs), bool_score if return_bool_score else None), - ) - res.append(answer) - - return dataset.get_original_order(res) + pass def loglikelihood_rolling(): pass diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index b86850a6c..cd493a980 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -48,7 +48,6 @@ logger = logging.getLogger(__name__) -## Jayon02: sglang with what dependency, ray? flashinfer? 
if is_vllm_available(): import ray from more_itertools import distribute @@ -287,6 +286,7 @@ def greedy_until( ) for vllm_output in vllm_outputs: + output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] From 985d366a790f26322161712f6f8fc1b1ba1cf222 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 23:24:24 +0800 Subject: [PATCH 05/10] fix outputs bug --- src/lighteval/models/sglang/sglang_model.py | 37 ++++++++++++--------- src/lighteval/models/vllm/vllm_model.py | 1 - 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index a45efb9f5..0a64cafcd 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -54,8 +54,6 @@ from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment -# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel - logging.getLogger("sglang").propagate = True logging.getLogger("sglang").handlers.clear() @@ -123,7 +121,7 @@ def tokenizer(self): def cleanup(self): destroy_model_parallel() if self.model is not None: - del self.model.llm_engine.model_executor.driver_worker + self.model.shutdown() self.model = None gc.collect() destroy_distributed_environment() @@ -169,6 +167,9 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - "log_level": "info", } + if config.dp_size > 1: + pass + model = Engine(**self.model_args) # TODO: double check @@ -267,15 +268,17 @@ def greedy_until( num_samples=num_samples, ) - for sglang_output in sglang_outputs: - print(sglang_output) - exit(0) - output_token_ids = [outputs.token_ids for outputs in sglang_output.outputs] - logprobs = [output.logprobs for output in sglang_output.outputs] or [] - logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] - result = [output.text for output in sglang_output.outputs] - input_token_ids = sglang_output.prompt_token_ids - + for i in range(len(sglang_outputs)): + sglang_output = sglang_outputs[i] + # print(sglang_output) + # exit(0) + meta_info = sglang_output["meta_info"] + output_token_logprobs = meta_info["output_token_logprobs"] + output_token_ids = [output[1] for output in output_token_logprobs] + logprobs = [output[0] for output in output_token_logprobs] + result = [sglang_output["text"]] + input_token_ids = inputs[i] + cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -296,20 +299,24 @@ def _generate( ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" # TODO: double check without clone - # sampling_params = self.sampling_params params = dict( top_p=1.0, top_k=-1, min_p=0, max_new_tokens=max_new_tokens, - # stop=stop_tokens, + stop=stop_tokens, temperature=1.0, repetition_penalty=1.0, skip_special_tokens=True, - spaces_between_special_tokens=True + spaces_between_special_tokens=True, + n=num_samples ) + if not generate: + params.temperature = 0 + params.max_tokens = 1 + outputs = self.model.generate( input_ids=inputs, sampling_params=params, diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 
cd493a980..a052b97ba 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -286,7 +286,6 @@ def greedy_until( ) for vllm_output in vllm_outputs: - output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] From c395b1e4a61f74ca838f7f782f950b26d8a819f8 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Mon, 10 Feb 2025 06:08:20 -0800 Subject: [PATCH 06/10] adjust precision --- sglang_inputs_token.txt | 16 -- sglang_output.txt | 11 -- sglang_sampling_para.txt | 1 - src/lighteval/__main__.py | 1 - src/lighteval/main_sglang.py | 5 - src/lighteval/main_vllm.py | 2 - src/lighteval/models/model_input.py | 19 +-- src/lighteval/models/sglang/sglang_model.py | 171 ++++++++++---------- src/lighteval/models/vllm/vllm_model.py | 12 +- src/lighteval/pipeline.py | 5 +- vllm_inputs_token.txt | 16 -- vllm_output.txt | 12 -- vllm_sampling_para.txt | 1 - 13 files changed, 97 insertions(+), 175 deletions(-) delete mode 100644 sglang_inputs_token.txt delete mode 100644 sglang_output.txt delete mode 100644 sglang_sampling_para.txt delete mode 100644 vllm_inputs_token.txt delete mode 100644 vllm_output.txt delete mode 100644 vllm_sampling_para.txt diff --git a/sglang_inputs_token.txt b/sglang_inputs_token.txt deleted file mode 100644 index 76bbf4434..000000000 --- a/sglang_inputs_token.txt +++ /dev/null @@ -1,16 +0,0 @@ -1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 -1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 -1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 
-1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 -1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 -1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 -1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 -1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 -1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 -1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 -1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/sglang_output.txt b/sglang_output.txt deleted file mode 100644 index e546c060d..000000000 --- a/sglang_output.txt +++ /dev/null @@ -1,11 +0,0 @@ -│ │ │ { │ │ -│ │ │ │ 'text': ' Answer according to: For the given dictionary d with keys │ │ -│ │ being integers, trave'+208, │ │ -│ │ │ │ 'meta_info': { │ │ -│ │ │ │ │ 'id': '04224678f3a24f7abfafef26cd6d101f', │ │ -│ │ │ │ │ 'finish_reason': {'type': 'stop', 'matched': '\n'}, │ │ -│ │ │ │ │ 'prompt_tokens': 77, │ │ -│ │ │ │ │ 'completion_tokens': 58, │ │ -│ │ │ │ │ 'cached_tokens': 0 │ │ -│ │ │ │ } │ │ -│ │ │ }, \ No newline at end of file diff --git a/sglang_sampling_para.txt b/sglang_sampling_para.txt deleted file mode 100644 index e4813cdca..000000000 --- a/sglang_sampling_para.txt +++ /dev/null @@ -1 +0,0 @@ -{'top_p': 1.0, 'top_k': -1, 'max_new_tokens': 100, 'stop': ['\n', ''], 'temperature': 1.0, 'repetition_penalty': 1.0, 'skip_special_tokens': True, 'spaces_between_special_tokens': True} \ No newline at end of file diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 77312f57c..1e79188dd 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -64,7 +64,6 @@ app.command(rich_help_panel="Evaluation 
Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_nanotron.nanotron) -# Jayon02: add vllm cmd app.command(rich_help_panel="Evaluation Backends")(lighteval.main_vllm.vllm) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang) app.add_typer( diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index 0c662a9dd..539867105 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -1,4 +1,3 @@ -# TODO: change to what? # MIT License # Copyright (c) 2024 The HuggingFace Team @@ -35,7 +34,6 @@ HELP_PANEL_NAME_3 = "Debug Parameters" HELP_PANEL_NAME_4 = "Modeling Parameters" -# TODO: change def sglang( # === general === model_args: Annotated[ @@ -117,7 +115,6 @@ def sglang( hub_results_org=results_org, ) - ## Jayon02: vllm pipeline parameter pipeline_params = PipelineParameters( launcher_type=ParallelismManager.SGLANG, env_config=env_config, @@ -132,7 +129,6 @@ def sglang( load_responses_from_details_date_id=load_responses_from_details_date_id, ) - ## Jayon02: support two ways to load model if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] @@ -144,7 +140,6 @@ def sglang( model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_config = SGLANGModelConfig(**model_args_dict) - pipeline = Pipeline( tasks=tasks, pipeline_parameters=pipeline_params, diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 90132dcba..130b5ff56 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -116,7 +116,6 @@ def vllm( hub_results_org=results_org, ) - ## Jayon02: vllm pipeline parameter pipeline_params = PipelineParameters( launcher_type=ParallelismManager.VLLM, env_config=env_config, @@ -131,7 +130,6 @@ def vllm( load_responses_from_details_date_id=load_responses_from_details_date_id, ) - ## Jayon02: support two ways to load model if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 575eb8024..c91d9fbc9 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -120,16 +120,11 @@ def to_tgi_ie_dict(self) -> dict: } return {k: v for k, v in args.items() if v is not None} - # def to_sglang_dict(self) -> dict: - # args = { - # "max_new_tokens": self.max_new_tokens, - # "min_new_tokens": self.min_new_tokens, - # "stop_token_ids": self.stop_tokens, - # "temperature": self.temperature, - # "top_k": self.top_k, - # "top_p": self.top_p, - # "min_p": self.min_p, - # "repetition_penalty": self.repetition_penalty, - # } - # return {k: v for k, v in args.items() if v is not None} + def to_sglang_dict(self) -> dict: + args = { + "max_new_tokens": self.max_new_tokens, + "stop_token_ids": self.stop_tokens, + "temperature": self.temperature, + } + return {k: v for k, v in args.items() if v is not None} diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 0a64cafcd..7bf3b999c 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -21,9 +21,10 @@ # SOFTWARE. 
import gc -import itertools import logging import os +import subprocess +import signal from dataclasses import dataclass from typing import Optional @@ -45,55 +46,47 @@ from lighteval.utils.imports import is_sglang_available from lighteval.utils.utils import EnvConfig, as_list - logger = logging.getLogger(__name__) -from more_itertools import distribute -from sglang import Engine -from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.sampling.sampling_params import SamplingParams -from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment - -logging.getLogger("sglang").propagate = True -logging.getLogger("sglang").handlers.clear() - -## Jayon02: sglang with what dependency, ray? flashinfer? -# if is_sglang_available(): -# from more_itertools import distribute -# from vllm import LLM, SamplingParams -# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel -# from vllm.transformers_utils.tokenizer import get_tokenizer -# -# logging.getLogger("sglang").propagate = True -# logging.getLogger("sglang").handlers.clear() -# else: -# LLM = None -# SamplingParams = None -# get_tokenizer = None -# distribute = None +if is_sglang_available(): + from sglang import Engine + from sglang.srt.hf_transformers_utils import get_tokenizer + from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment + + logging.getLogger("sglang").propagate = True + logging.getLogger("sglang").handlers.clear() +else: + Engine = None + get_tokenizer = None os.environ["TOKENIZERS_PARALLELISM"] = "false" STARTING_BATCH_SIZE = 512 -## change to all sglang config @dataclass class SGLANGModelConfig: - pretrained: str # + pretrained: str load_format: str = "auto" - dtype: str = "auto" # + dtype: str = "auto" tp_size: int = 1 # how many GPUs to use for tensor parallelism dp_size: int = 1 # how many GPUs to use for data parallelism context_length: int | None = None - random_seed: Optional[int] = None - trust_remote_code: bool = True # - chat_template: Optional[str] = None + random_seed: Optional[int] = 1234 + trust_remote_code: bool = False + chat_template: Optional[str] = None # no use + use_chat_template: bool = False device: str = "cuda" skip_tokenizer_init: bool = False - kv_cache_dtype: str = "auto", - add_special_tokens: bool = True, + kv_cache_dtype: str = "auto" + add_special_tokens: bool = True pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism + generation_parameters: GenerationParameters = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() + class SGLANGModel(LightevalModel): def __init__( self, @@ -102,26 +95,57 @@ def __init__( ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" self._config = config - self.use_chat_template = config.chat_template is not None + self.use_chat_template = config.use_chat_template self.data_parallel_size = int(config.dp_size) self.tensor_parallel_size = int(config.tp_size) self._add_special_tokens = bool(config.add_special_tokens) self._tokenizer = self._create_auto_tokenizer(config, env_config) - self._max_length = int(config.context_length) if config.context_length is not None else 256 + self._max_length = int(config.context_length) if config.context_length is not None else None self.model = self._create_auto_model(config, env_config) self.model_name = _simplify_name(config.pretrained) self.model_sha = 
"" # config.get_model_sha() self.precision = _get_dtype(config.dtype, config=self._config) + self.sampling_params = config.generation_parameters.to_sglang_dict() self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - + @property def tokenizer(self): return self._tokenizer def cleanup(self): + + def reap_children(signum, frame): + try: + while True: + pid, status = os.waitpid(-1, os.WNOHANG) + if pid == 0: + break + print(f"Reaped child process {pid} with status {status}") + except ChildProcessError: + pass + + signal.signal(signal.SIGCHLD, reap_children) + + destroy_model_parallel() if self.model is not None: self.model.shutdown() + result = subprocess.run(["nvidia-smi", "--query-compute-apps=pid,process_name,gpu_uuid", + "--format=csv,noheader,nounits"], capture_output=True, text=True) + lines = result.stdout.strip().split("\n") + target_pids = [] + + for line in lines: + parts = [p.strip() for p in line.split(",")] + if len(parts) < 2: + continue + pid, process_name = parts[:2] + if process_name == "sglang::scheduler": + target_pids.append(pid) + + for pid in target_pids: + os.kill(int(pid), 9) + self.model = None gc.collect() destroy_distributed_environment() @@ -136,23 +160,7 @@ def max_length(self) -> int: return self._max_length def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) -> Optional[Engine]: - """ - Creates an instance of the pretrained HF model. - Args: - pretrained (str): The name or path of the pretrained model. - revision (str): The revision of the model. - subfolder (Optional[str], optional): The subfolder within the model. Defaults to None. - max_memory (Optional[dict], optional): The maximum memory to allocate for the model per GPU. Defaults to None. - device_map (Optional[dict], optional): The device mapping for the model. Defaults to None. - torch_dtype (Optional[Union[str, torch.dtype]], optional): The torch data type for the model. Defaults to None. - quantization_config (Optional[Union[BitsAndBytesConfig, GPTQConfig]], optional): The quantization configuration for the model. Defaults to None. - trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. - cache_dir (str, optional): The cache directory for the model. Defaults to "/scratch". - - Returns: - transformers.PreTrainedModel: The created auto model instance. - """ # TODO: double check self.model_args = { "model_path": config.pretrained, @@ -172,12 +180,8 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - model = Engine(**self.model_args) - # TODO: double check - # If the max_length can't get extracted from the config, it will be inferred from the model - # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model - # config and tk config, like mistralai/Mistral-7B-v0.1 - # if self._max_length is None: - # self._max_length = model.llm_engine.model_config.max_seq_len_to_capture + if self._max_length is None: + self._max_length = 8192 return model @@ -218,15 +222,12 @@ def greedy_until( total=dataset.num_dataset_splits, desc="Splits", position=0, - disable=False, # self.disable_tqdm, + disable=False, ): - # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: stop_tokens = [] else: - # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens genrated) which is not necessarily - # the case! 
Because of that we only use batch size of 1 stop_tokens = dataset[0].stop_sequence max_new_tokens = dataset[0].generation_size # could be none @@ -267,17 +268,13 @@ def greedy_until( stop_tokens=stop_tokens, num_samples=num_samples, ) - - for i in range(len(sglang_outputs)): - sglang_output = sglang_outputs[i] - # print(sglang_output) - # exit(0) + + for input_token_ids, sglang_output in zip(inputs, sglang_outputs): meta_info = sglang_output["meta_info"] output_token_logprobs = meta_info["output_token_logprobs"] output_token_ids = [output[1] for output in output_token_logprobs] logprobs = [output[0] for output in output_token_logprobs] result = [sglang_output["text"]] - input_token_ids = inputs[i] cur_response = GenerativeResponse( result=result, @@ -298,31 +295,27 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" - # TODO: double check without clone - - params = dict( - top_p=1.0, - top_k=-1, - min_p=0, - max_new_tokens=max_new_tokens, - stop=stop_tokens, - temperature=1.0, - repetition_penalty=1.0, - skip_special_tokens=True, - spaces_between_special_tokens=True, - n=num_samples - ) - - if not generate: - params.temperature = 0 - params.max_tokens = 1 + # TODO: double check + + self.sampling_params["stop"] = stop_tokens + self.sampling_params["n"] = num_samples + self.sampling_params["top_p"] = 1.0 + self.sampling_params["top_k"] = -1 + self.sampling_params["skip_special_tokens"] = True + + if generate: + self.sampling_params["temperature"] = 0.6 + self.sampling_params["max_new_tokens"] = max_new_tokens + else: + self.sampling_params["temperature"] = 0 + self.sampling_params["max_new_tokens"] = 1 outputs = self.model.generate( input_ids=inputs, - sampling_params=params, + sampling_params=self.sampling_params, return_logprob=True, ) - + return outputs def loglikelihood( diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index a052b97ba..051b088fe 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -71,7 +71,6 @@ STARTING_BATCH_SIZE = 512 -## change to all sglang config @dataclass class VLLMModelConfig: pretrained: str @@ -130,7 +129,7 @@ def __init__( self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) self.sampling_params = SamplingParams(**config.generation_parameters.to_vllm_openai_dict()) self.pairwise_tokenization = config.pairwise_tokenization - + @property def tokenizer(self): return self._tokenizer @@ -276,7 +275,7 @@ def greedy_until( ) context_size = self.max_length inputs = [input[-context_size:] for input in inputs] - + vllm_outputs = self._generate( inputs=inputs, max_new_tokens=max_new_tokens, @@ -284,14 +283,14 @@ def greedy_until( returns_logits=returns_logits, num_samples=num_samples, ) - + for vllm_output in vllm_outputs: output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] result = [output.text for output in vllm_output.outputs] input_token_ids = vllm_output.prompt_token_ids - + cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -358,12 +357,13 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r sampling_params=sampling_params, use_tqdm=True, ) - + return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: 
Optional[int] = None ) -> list[LoglikelihoodResponse]: + for request in requests: if request.context == "": request.tokenized_context = [self.tokenizer.eos_token_id] diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 148c1849e..d71d6991a 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -151,7 +151,6 @@ def __init__( self.evaluation_tracker = evaluation_tracker self.accelerator, self.parallel_context = self._init_parallelism_manager() self.model = self._init_model(model_config, model) - self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info) self._init_tasks_and_requests(tasks=tasks) self._init_random_seeds() @@ -193,7 +192,6 @@ def _init_model(self, model_config, model): env_config=self.pipeline_parameters.env_config, ) else: - ## Jayon02: load model into vllm return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) if isinstance(model, TransformersModel): @@ -213,10 +211,10 @@ def _init_tasks_and_requests(self, tasks: str): cache_dir=self.pipeline_parameters.env_config.cache_dir, custom_tasks=self.pipeline_parameters.custom_tasks_directory, ) + task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) task_dict = registry.get_task_dict(task_names_list) LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes) - self.evaluation_tracker.task_config_logger.log(task_dict) requests, docs = create_requests_from_tasks( @@ -451,6 +449,7 @@ def _run_model(self): responses = run_model(requests, override_bs=self.pipeline_parameters.override_batch_size) # Storing the responses associated to the same samples together + for response, request in zip(responses, requests): for metric_category in request.metric_categories: sample_id = SampleUid(request.task_name, request.sample_index) diff --git a/vllm_inputs_token.txt b/vllm_inputs_token.txt deleted file mode 100644 index 76bbf4434..000000000 --- a/vllm_inputs_token.txt +++ /dev/null @@ -1,16 +0,0 @@ -1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 -1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 -1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 
-1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 -1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 -1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 -1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 -1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 -1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 -1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 -1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 -1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/vllm_output.txt b/vllm_output.txt deleted file mode 100644 index d4fc354cb..000000000 --- a/vllm_output.txt +++ /dev/null @@ -1,12 +0,0 @@ -RequestOutput(request_id=0, -prompt=None, -prompt_token_ids=[1, 8789, 13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], -encoder_prompt=None, -encoder_prompt_token_ids=None, prompt_logprobs=None, -outputs=[CompletionOutput(index=0, text='', token_ids=(13,), - 
cumulative_logprob=-0.047923602163791656, - logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], - finish_reason=stop, stop_reason=)], -finished=True, -metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), -lora_request=None, num_cached_tokens=0) diff --git a/vllm_sampling_para.txt b/vllm_sampling_para.txt deleted file mode 100644 index 17aabb410..000000000 --- a/vllm_sampling_para.txt +++ /dev/null @@ -1 +0,0 @@ -SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=['\n', ''], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=100, min_tokens=0, logprobs=0, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None) \ No newline at end of file From 9c0f7cb0c7625824e93ce70505d73ae69e0927ac Mon Sep 17 00:00:00 2001 From: qiujiang chen Date: Wed, 12 Feb 2025 05:26:05 +0000 Subject: [PATCH 07/10] adjust precision --- src/lighteval/models/sglang/sglang_model.py | 39 ++------------------- src/lighteval/models/vllm/vllm_model.py | 3 -- 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 7bf3b999c..52cb2ec29 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -73,7 +73,6 @@ class SGLANGModelConfig: context_length: int | None = None random_seed: Optional[int] = 1234 trust_remote_code: bool = False - chat_template: Optional[str] = None # no use use_chat_template: bool = False device: str = "cuda" skip_tokenizer_init: bool = False @@ -112,39 +111,10 @@ def __init__( def tokenizer(self): return self._tokenizer - def cleanup(self): - - def reap_children(signum, frame): - try: - while True: - pid, status = os.waitpid(-1, os.WNOHANG) - if pid == 0: - break - print(f"Reaped child process {pid} with status {status}") - except ChildProcessError: - pass - - signal.signal(signal.SIGCHLD, reap_children) - - + def cleanup(self): destroy_model_parallel() if self.model is not None: self.model.shutdown() - result = subprocess.run(["nvidia-smi", "--query-compute-apps=pid,process_name,gpu_uuid", - "--format=csv,noheader,nounits"], capture_output=True, text=True) - lines = result.stdout.strip().split("\n") - target_pids = [] - - for line in lines: - parts = [p.strip() for p in line.split(",")] - if len(parts) < 2: - continue - pid, process_name = parts[:2] - if process_name == "sglang::scheduler": - target_pids.append(pid) - - for pid in target_pids: - os.kill(int(pid), 9) self.model = None gc.collect() @@ -275,7 +245,6 @@ def greedy_until( output_token_ids = [output[1] for output in output_token_logprobs] logprobs = [output[0] for output in output_token_logprobs] result = [sglang_output["text"]] - cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -296,26 +265,22 @@ def _generate( ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" # TODO: double check - self.sampling_params["stop"] = stop_tokens self.sampling_params["n"] = num_samples self.sampling_params["top_p"] = 1.0 
self.sampling_params["top_k"] = -1 self.sampling_params["skip_special_tokens"] = True + self.sampling_params["temperature"] = 0 if generate: - self.sampling_params["temperature"] = 0.6 self.sampling_params["max_new_tokens"] = max_new_tokens else: - self.sampling_params["temperature"] = 0 self.sampling_params["max_new_tokens"] = 1 - outputs = self.model.generate( input_ids=inputs, sampling_params=self.sampling_params, return_logprob=True, ) - return outputs def loglikelihood( diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 051b088fe..247e477d1 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -290,7 +290,6 @@ def greedy_until( logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] result = [output.text for output in vllm_output.outputs] input_token_ids = vllm_output.prompt_token_ids - cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -357,13 +356,11 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r sampling_params=sampling_params, use_tqdm=True, ) - return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodResponse]: - for request in requests: if request.context == "": request.tokenized_context = [self.tokenizer.eos_token_id] From b733c86686b1d1f15ad4f210fb2b36d232d70bf4 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 15 Feb 2025 11:23:39 +0000 Subject: [PATCH 08/10] Squashed commit of the following: commit 132290b571b7470ffec04461ce504eb352b031a3 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 11:08:24 2025 +0000 modify document commit 601a75504be8320703af401eaddce2ceef40f3ca Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 10:22:43 2025 +0000 pass pre commit check and modify document commit 3e1fb8899c073622f76c14b7a12c392cfe426a37 Author: qiujiang chen Date: Sat Feb 15 06:59:12 2025 +0000 optimize input, adjust precision commit 1a590760aa5dd188d2e31e3f6ee316433004614d Author: qiujiang chen Date: Thu Feb 13 19:51:22 2025 +0000 text files commit 9dc62b7f55a190e42cd46016a12358c1f1b78c93 Author: qiujiang chen Date: Wed Feb 12 14:08:21 2025 +0000 modify format --- docs/source/_toctree.yml | 2 + docs/source/installation.mdx | 3 + docs/source/use-sglang-as-backend.mdx | 52 ++++++++ src/lighteval/__main__.py | 2 +- src/lighteval/main_sglang.py | 25 +++- src/lighteval/main_vllm.py | 1 - src/lighteval/models/model_input.py | 29 ++-- src/lighteval/models/model_loader.py | 17 +-- src/lighteval/models/sglang/sglang_model.py | 138 +++++++++++++------- src/lighteval/models/vllm/vllm_model.py | 10 +- src/lighteval/pipeline.py | 9 +- src/lighteval/tasks/registry.py | 1 - src/lighteval/utils/imports.py | 5 +- 13 files changed, 211 insertions(+), 83 deletions(-) create mode 100644 docs/source/use-sglang-as-backend.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 9ad55466a..2a56f1eb5 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,6 +17,8 @@ title: Add a custom metric - local: use-vllm-as-backend title: Use VLLM as backend + - local: use-sglang-as-backend + title: Use SGLang as backend - local: evaluate-the-model-on-a-server-or-container title: Evaluate on Server - local: contributing-to-multilingual-evaluations diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 542c09752..a57bcf6b7 
100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -23,6 +23,8 @@ Lighteval has optional dependencies that you can install by specifying the appropriate extras group. `pip install lighteval[]` or `pip install -e .[]`. +If you want to use lighteval with `sglang`, follow the [sglang installation documentation](https://docs.sglang.ai/start/install.html). + | extra name | description | |--------------|---------------------------------------------------------------------------| | tgi | To use Text Generation Inference API to evaluate your model | @@ -31,6 +33,7 @@ appropriate extras group. | adapters | To evaluate adapters models (delta and peft) | | tensorboardX | To upload your results to tensorboard | | vllm | To use vllm as backend for inference | +| sglang | To use sglang as backend for inference | | s3 | To upload results to s3 | diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx new file mode 100644 index 000000000..2edcaeeb5 --- /dev/null +++ b/docs/source/use-sglang-as-backend.mdx @@ -0,0 +1,52 @@ +# Use SGLang as backend + +Lighteval allows you to use `sglang` as a backend, which provides great speedups. +To use it, simply change the `model_args` to reflect the arguments you want to pass to sglang. + +```bash +lighteval sglang \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +`sglang` is able to distribute the model across multiple GPUs using data +parallelism and tensor parallelism. +You can choose the parallelism method by setting it in the `model_args`. + +For example, if you have 4 GPUs, you can split the model across them using tensor parallelism (`tp_size`): + +```bash +lighteval sglang \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +Or, if your model fits on a single GPU, you can use data parallelism (`dp_size`) to speed up the evaluation: + +```bash +lighteval sglang \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +Available arguments for `sglang` can be found in the `SGLangModelConfig` (a short configuration sketch follows this list): + +- **pretrained** (str): HuggingFace Hub model ID or the path to a pre-trained model to load. +- **load_format** (str): The format the weights are loaded in. Defaults to *.safetensors/*.bin. +- **dtype** (str): Dtype used for the model, defaults to bfloat16. +- **tp_size** (int): The number of GPUs the model weights get sharded over. +- **dp_size** (int): The number of data-parallel copies of the model. +- **context_length** (int | None): The number of tokens the model can process, including the input. +- **random_seed** (int): Can be used to enforce more deterministic behavior. +- **trust_remote_code** (bool): Whether to allow custom model code from the Hub to be executed when loading the model and tokenizer. +- **skip_tokenizer_init** (bool): Set to True to provide the tokens to the engine and get the output tokens directly, typically used in RLHF. +- **kv_cache_dtype** (str): Dtype of the KV cache, defaults to auto. +- **add_special_tokens** (bool): Whether to add special tokens to the input sequences. +- **sampling_backend** (str | None): The backend for sampling. +- **attention_backend** (str | None): The backend for attention computation and KV cache management. +- **mem_fraction_static** (float): Fraction of the free GPU memory used for static memory like model weights and KV cache. +- **chunked_prefill_size** (int): Perform the prefill in chunks of this size.
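The arguments above can also be assembled programmatically. Below is a minimal, editorial sketch (not part of the patch itself) of how they map onto the `SGLangModelConfig` dataclass introduced in this patch series; the model name and every value shown are illustrative assumptions rather than defaults.

```python
from lighteval.models.sglang.sglang_model import SGLangModelConfig

# Illustrative values only; the field names follow the SGLangModelConfig
# dataclass added by this patch series.
config = SGLangModelConfig(
    pretrained="HuggingFaceH4/zephyr-7b-beta",  # HF Hub ID or local path
    dtype="float16",
    tp_size=1,                  # tensor-parallel shards
    dp_size=1,                  # data-parallel replicas
    context_length=4096,        # prompt + generation budget
    mem_fraction_static=0.8,    # lower this if you run out of GPU memory
    chunked_prefill_size=2048,  # smaller chunks also reduce peak memory
)
```

The same key=value pairs can be passed on the command line through `model_args`, as in the bash examples above.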
+ +> [!WARNING] +> In the case of OOM issues, you might need to reduce the context size of the +> model as well as reduce the `mem_fraction_static` and `chunked_prefill_size` parameter. diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 1e79188dd..eece09f4c 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -29,9 +29,9 @@ import lighteval.main_baseline import lighteval.main_endpoint import lighteval.main_nanotron +import lighteval.main_sglang import lighteval.main_tasks import lighteval.main_vllm -import lighteval.main_sglang app = typer.Typer() diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index 539867105..a49840abe 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -1,6 +1,6 @@ # MIT License -# Copyright (c) 2024 The HuggingFace Team +# Copyright (c) 2024 The SGLang Team # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -19,7 +19,9 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import json import os +import re from typing import Optional from typer import Argument, Option @@ -34,6 +36,7 @@ HELP_PANEL_NAME_3 = "Debug Parameters" HELP_PANEL_NAME_4 = "Modeling Parameters" + def sglang( # === general === model_args: Annotated[ @@ -99,7 +102,7 @@ def sglang( from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.models.model_input import GenerationParameters - from lighteval.models.sglang.sglang_model import SGLANGModelConfig + from lighteval.models.sglang.sglang_model import SGLangModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters TOKEN = os.getenv("HF_TOKEN") @@ -133,13 +136,23 @@ def sglang( with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] generation_parameters = GenerationParameters.from_dict(config) - model_config = SGLANGModelConfig(config, generation_parameters=generation_parameters) + model_config = SGLangModelConfig(config, generation_parameters=generation_parameters) else: - ## cmd arg - model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + pattern = re.compile(r"(\w+)=(\{.*\}|[^,]+)") + matches = pattern.findall(model_args) + model_args_dict = {} + generation_params = None + for key, value in matches: + key = key.strip() + if key == "generation_parameters": + value = re.sub(r"(\w+):", r'"\1":', value) + value = json.loads(value) + generation_params = GenerationParameters(**value) + else: + model_args_dict[key] = value + model_config = SGLangModelConfig(**model_args_dict, generation_parameters=generation_params) - model_config = SGLANGModelConfig(**model_args_dict) pipeline = Pipeline( tasks=tasks, pipeline_parameters=pipeline_params, diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 130b5ff56..d063c3fa8 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -137,7 +137,6 @@ def vllm( model_config = VLLMModelConfig(config, generation_parameters=generation_parameters) else: - ## cmd arg model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_config = VLLMModelConfig(**model_args_dict) diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 
c91d9fbc9..21e8496a4 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -27,20 +27,20 @@ @dataclass class GenerationParameters: early_stopping: Optional[bool] = None # vllm, transformers - repetition_penalty: Optional[float] = None # vllm, transformers, tgi - frequency_penalty: Optional[float] = None # vllm, tgi + repetition_penalty: Optional[float] = None # vllm, transformers, tgi, sglang + frequency_penalty: Optional[float] = None # vllm, tgi, sglang length_penalty: Optional[float] = None # vllm, transformers - presence_penalty: Optional[float] = None # vllm + presence_penalty: Optional[float] = None # vllm, sglang - max_new_tokens: Optional[int] = None # vllm, transformers, tgi - min_new_tokens: Optional[int] = None # vllm, transformers + max_new_tokens: Optional[int] = None # vllm, transformers, tgi, sglang + min_new_tokens: Optional[int] = None # vllm, transformers, sglang seed: Optional[int] = None # vllm, tgi - stop_tokens: Optional[list[str]] = None # vllm, transformers, tgi - temperature: Optional[float] = None # vllm, transformers, tgi - top_k: Optional[int] = None # vllm, transformers, tgi - min_p: Optional[float] = None # vllm, transformers - top_p: Optional[int] = None # vllm, transformers, tgi + stop_tokens: Optional[list[str]] = None # vllm, transformers, tgi, sglang + temperature: Optional[float] = None # vllm, transformers, tgi, sglang + top_k: Optional[int] = None # vllm, transformers, tgi, sglang + min_p: Optional[float] = None # vllm, transformers, sglang + top_p: Optional[int] = None # vllm, transformers, tgi, sglang truncate_prompt: Optional[bool] = None # vllm, tgi @classmethod @@ -125,6 +125,13 @@ def to_sglang_dict(self) -> dict: "max_new_tokens": self.max_new_tokens, "stop_token_ids": self.stop_tokens, "temperature": self.temperature, + "stop": self.stop_tokens, + "top_p": self.top_p, + "top_k": self.top_k, + "min_p": self.min_p, + "frequency_penalty": self.frequency_penalty, + "presence_penalty": self.presence_penalty, + "repetition_penalty": self.repetition_penalty, + "min_new_tokens": self.min_new_tokens, } return {k: v for k, v in args.items() if v is not None} - diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index 3bf76af7d..ebcd9d3bb 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -32,19 +32,21 @@ from lighteval.models.endpoints.openai_model import OpenAIClient, OpenAIModelConfig from lighteval.models.endpoints.tgi_model import ModelClient, TGIModelConfig from lighteval.models.litellm_model import LiteLLMClient, LiteLLMModelConfig -from lighteval.models.sglang.sglang_model import SGLANGModelConfig, SGLANGModel +from lighteval.models.sglang.sglang_model import SGLangModel, SGLangModelConfig from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig from lighteval.utils.imports import ( NO_LITELLM_ERROR_MSG, + NO_SGLANG_ERROR_MSG, NO_TGI_ERROR_MSG, NO_VLLM_ERROR_MSG, is_litellm_available, is_openai_available, + is_sglang_available, is_tgi_available, - is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, + is_vllm_available, ) from lighteval.utils.utils import EnvConfig @@ -63,7 +65,7 @@ def load_model( # noqa: C901 VLLMModelConfig, 
OpenAIModelConfig, LiteLLMModelConfig, - SGLANGModelConfig, + SGLangModelConfig, ], env_config: EnvConfig, ) -> Union[TransformersModel, AdapterModel, DeltaModel, ModelClient, DummyModel]: @@ -98,9 +100,7 @@ def load_model( # noqa: C901 if isinstance(config, VLLMModelConfig): return load_model_with_accelerate_or_default(config=config, env_config=env_config) - if isinstance(config, SGLANGModelConfig): - # TODO: double check - # return load_model_with_accelerate_or_default(config=config, env_config=env_config) + if isinstance(config, SGLangModelConfig): return load_sglang_model(config=config, env_config=env_config) if isinstance(config, OpenAIModelConfig): @@ -167,8 +167,9 @@ def load_model_with_accelerate_or_default( def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig): return DummyModel(config=config, env_config=env_config) -def load_sglang_model(config: SGLANGModelConfig, env_config: EnvConfig): + +def load_sglang_model(config: SGLangModelConfig, env_config: EnvConfig): if not is_sglang_available(): raise ImportError(NO_SGLANG_ERROR_MSG) - return SGLANGModel(config=config, env_config=env_config) + return SGLangModel(config=config, env_config=env_config) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 52cb2ec29..9bb8af7bd 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -1,6 +1,6 @@ # MIT License -# Copyright (c) 2024 The HuggingFace Team +# Copyright (c) 2024 The SGLang Team # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -22,9 +22,6 @@ import gc import logging -import os -import subprocess -import signal from dataclasses import dataclass from typing import Optional @@ -46,25 +43,22 @@ from lighteval.utils.imports import is_sglang_available from lighteval.utils.utils import EnvConfig, as_list + logger = logging.getLogger(__name__) if is_sglang_available(): from sglang import Engine from sglang.srt.hf_transformers_utils import get_tokenizer - from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment - + logging.getLogger("sglang").propagate = True logging.getLogger("sglang").handlers.clear() else: Engine = None get_tokenizer = None -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -STARTING_BATCH_SIZE = 512 @dataclass -class SGLANGModelConfig: +class SGLangModelConfig: pretrained: str load_format: str = "auto" dtype: str = "auto" @@ -78,18 +72,22 @@ class SGLANGModelConfig: skip_tokenizer_init: bool = False kv_cache_dtype: str = "auto" add_special_tokens: bool = True - pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism - + pairwise_tokenization: bool = False + sampling_backend: str | None = None + attention_backend: str = None + mem_fraction_static: float = 0.8 + chunked_prefill_size: int = 4096 generation_parameters: GenerationParameters = None - + def __post_init__(self): if not self.generation_parameters: self.generation_parameters = GenerationParameters() -class SGLANGModel(LightevalModel): + +class SGLangModel(LightevalModel): def __init__( self, - config: SGLANGModelConfig, + config: SGLangModelConfig, env_config: EnvConfig, ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" @@ -106,19 +104,20 @@ def __init__( self.precision = _get_dtype(config.dtype, config=self._config) self.sampling_params = 
config.generation_parameters.to_sglang_dict() self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - + self.sampling_backend = config.sampling_backend + self.attention_backend = config.attention_backend + self.pairwise_tokenization = config.pairwise_tokenization + @property def tokenizer(self): return self._tokenizer - def cleanup(self): - destroy_model_parallel() + def cleanup(self): if self.model is not None: self.model.shutdown() self.model = None gc.collect() - destroy_distributed_environment() torch.cuda.empty_cache() @property @@ -129,33 +128,33 @@ def add_special_tokens(self): def max_length(self) -> int: return self._max_length - def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) -> Optional[Engine]: - - # TODO: double check - self.model_args = { + def _create_auto_model(self, config: SGLangModelConfig, env_config: EnvConfig) -> Optional[Engine]: + self.model_args = { "model_path": config.pretrained, "trust_remote_code": config.trust_remote_code, "dtype": config.dtype, "device": "cuda", "random_seed": config.random_seed, "load_format": config.load_format, - "context_length": int(self._max_length) if self._max_length else None, + "context_length": int(self._max_length) if self._max_length else 8192, "dp_size": int(config.dp_size), "tp_size": int(config.tp_size), - "log_level": "info", + "sampling_backend": config.sampling_backend, + "attention_backend": config.attention_backend, + "mem_fraction_static": float(config.mem_fraction_static), + "schedule_policy": "fcfs", + "chunked_prefill_size": int(config.chunked_prefill_size), + "disable_radix_cache": True, } - if config.dp_size > 1: - pass - model = Engine(**self.model_args) if self._max_length is None: - self._max_length = 8192 + self._max_length = 8192 return model - def _create_auto_tokenizer(self, config: SGLANGModelConfig, env_config: EnvConfig): + def _create_auto_tokenizer(self, config: SGLangModelConfig, env_config: EnvConfig): tokenizer = get_tokenizer( config.pretrained, tokenizer_mode="auto", @@ -194,14 +193,12 @@ def greedy_until( position=0, disable=False, ): - if self.use_chat_template: stop_tokens = [] else: stop_tokens = dataset[0].stop_sequence max_new_tokens = dataset[0].generation_size # could be none - returns_logits = dataset[0].use_logits num_samples = dataset[0].num_samples context = [c.context for c in dataset] @@ -212,7 +209,7 @@ def greedy_until( # of losing some meaning, or have some generations that are exceedingly short? # The choice we go for here is to avoid truncating the prompt if we can, since it # should have been managed by the prompt creator/few shot manager if requested by the user. 
- + inputs = tokenized["input_ids"] context_size = len(inputs[0]) @@ -238,7 +235,7 @@ def greedy_until( stop_tokens=stop_tokens, num_samples=num_samples, ) - + for input_token_ids, sglang_output in zip(inputs, sglang_outputs): meta_info = sglang_output["meta_info"] output_token_logprobs = meta_info["output_token_logprobs"] @@ -252,7 +249,6 @@ def greedy_until( input_tokens=input_token_ids, ) results.append(cur_response) - return dataset.get_original_order(results) def _generate( @@ -264,29 +260,77 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" - # TODO: double check - self.sampling_params["stop"] = stop_tokens - self.sampling_params["n"] = num_samples - self.sampling_params["top_p"] = 1.0 - self.sampling_params["top_k"] = -1 - self.sampling_params["skip_special_tokens"] = True - self.sampling_params["temperature"] = 0 + logprob_start_len = None + top_logprobs_num = None if generate: self.sampling_params["max_new_tokens"] = max_new_tokens + self.sampling_params["stop"] = stop_tokens + self.sampling_params["n"] = num_samples else: self.sampling_params["max_new_tokens"] = 1 + self.sampling_params["temperature"] = 0 + logprob_start_len = 0 + top_logprobs_num = 1 + outputs = self.model.generate( - input_ids=inputs, - sampling_params=self.sampling_params, - return_logprob=True, - ) + input_ids=inputs, + sampling_params=self.sampling_params, + return_logprob=True, + logprob_start_len=logprob_start_len, + top_logprobs_num=top_logprobs_num, + ) return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodResponse]: - pass + for request in requests: + if request.context == "": + request.tokenized_context = [self.tokenizer.eos_token_id] + request.tokenized_continuation = self.tok_encode(request.choice) + else: + # The following line is mandatory for compatibility with the harness + request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( + request.context, request.choice, pairwise=self.pairwise_tokenization + ) + + return self._loglikelihood_tokens(requests, override_bs=override_bs) + + def _loglikelihood_tokens( + self, + requests: list[LoglikelihoodRequest], + override_bs: int = -1, + return_bool_score: bool = True, + rolling: bool = False, + ) -> list[LoglikelihoodResponse]: + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=1) + res = [] + + for _ in tqdm(dataset.splits_start_end_iterator(), disable=False): + # the last token is an eos token, so we don't need to add it + inputs = [dataset[i].tokenized_context + dataset[i].tokenized_continuation for i in range(len(dataset))] + # Left truncate the inputs to the maximum length + inputs = [input[-self.max_length :] for input in inputs] + outputs = self._generate(inputs, generate=False) + + for output, input in zip(outputs, dataset): + continuation_logprobs = [] + meta_info = output["meta_info"] + input_token_logprobs = meta_info["input_token_logprobs"][::-1] + input_top_logprobs = meta_info["input_top_logprobs"][::-1] + input_top_logprobs = input_top_logprobs[: len(input.tokenized_continuation)] + continuation_logprobs.append(input_token_logprobs[: len(input.tokenized_continuation)]) + bool_score = all( + top[0][1] == input[1] for top, input in zip(input_top_logprobs, continuation_logprobs[0]) + ) + answer = LoglikelihoodResponse( + input_tokens=input.tokenized_context + input.tokenized_continuation, + generated_tokens=input.tokenized_continuation, + 
result=(sum(item[0] for item in continuation_logprobs[0]), bool_score), + ) + res.append(answer) + return dataset.get_original_order(res) def loglikelihood_rolling(): pass diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 247e477d1..0bb13a6f1 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -48,6 +48,7 @@ logger = logging.getLogger(__name__) + if is_vllm_available(): import ray from more_itertools import distribute @@ -71,6 +72,7 @@ STARTING_BATCH_SIZE = 512 + @dataclass class VLLMModelConfig: pretrained: str @@ -129,7 +131,7 @@ def __init__( self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) self.sampling_params = SamplingParams(**config.generation_parameters.to_vllm_openai_dict()) self.pairwise_tokenization = config.pairwise_tokenization - + @property def tokenizer(self): return self._tokenizer @@ -275,7 +277,7 @@ def greedy_until( ) context_size = self.max_length inputs = [input[-context_size:] for input in inputs] - + vllm_outputs = self._generate( inputs=inputs, max_new_tokens=max_new_tokens, @@ -283,13 +285,14 @@ def greedy_until( returns_logits=returns_logits, num_samples=num_samples, ) - + for vllm_output in vllm_outputs: output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] result = [output.text for output in vllm_output.outputs] input_token_ids = vllm_output.prompt_token_ids + cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -356,6 +359,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r sampling_params=sampling_params, use_tqdm=True, ) + return outputs def loglikelihood( diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index d71d6991a..649bdcf8b 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -51,13 +51,15 @@ NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, NO_OPENAI_ERROR_MSG, + NO_SGLANG_ERROR_MSG, NO_TGI_ERROR_MSG, NO_VLLM_ERROR_MSG, is_accelerate_available, is_nanotron_available, is_openai_available, + is_sglang_available, is_tgi_available, - is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, + is_vllm_available, ) from lighteval.utils.parallelism import test_all_gather from lighteval.utils.utils import EnvConfig, make_results_table @@ -151,6 +153,7 @@ def __init__( self.evaluation_tracker = evaluation_tracker self.accelerator, self.parallel_context = self._init_parallelism_manager() self.model = self._init_model(model_config, model) + self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info) self._init_tasks_and_requests(tasks=tasks) self._init_random_seeds() @@ -193,7 +196,6 @@ def _init_model(self, model_config, model): ) else: return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) - if isinstance(model, TransformersModel): return model else: @@ -211,10 +213,10 @@ def _init_tasks_and_requests(self, tasks: str): cache_dir=self.pipeline_parameters.env_config.cache_dir, custom_tasks=self.pipeline_parameters.custom_tasks_directory, ) - task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) task_dict = registry.get_task_dict(task_names_list) LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes) + 
self.evaluation_tracker.task_config_logger.log(task_dict)

         requests, docs = create_requests_from_tasks(
@@ -449,7 +451,6 @@ def _run_model(self):
             responses = run_model(requests, override_bs=self.pipeline_parameters.override_batch_size)

             # Storing the responses associated to the same samples together
-
             for response, request in zip(responses, requests):
                 for metric_category in request.metric_categories:
                     sample_id = SampleUid(request.task_name, request.sample_index)
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 71a8c2bb1..174a98d33 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -299,7 +299,6 @@ def taskinfo_selector(tasks: str, task_registry: Registry) -> tuple[list[str], d
         expanded_tasks = task_registry.task_groups_dict.get(maybe_task_group, [maybe_task_group])
         expanded_tasks_list.extend(expanded_tasks)

-    ## task expand and few shot number record, not load task
     for task in expanded_tasks_list:
         try:
             suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|"))
diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py
index 14e3b94da..5a007c95f 100644
--- a/src/lighteval/utils/imports.py
+++ b/src/lighteval/utils/imports.py
@@ -87,14 +87,17 @@ def is_litellm_available() -> bool:
 def is_vllm_available() -> bool:
     return importlib.util.find_spec("vllm") is not None and importlib.util.find_spec("ray") is not None

+
 NO_VLLM_ERROR_MSG = "You are trying to use an VLLM model, for which you need `vllm` and `ray`, which are not available in your environment. Please install them using pip, `pip install vllm ray`."

-# TODO: need review
+
 def is_sglang_available() -> bool:
     return importlib.util.find_spec("sglang") is not None and importlib.util.find_spec("flashinfer") is not None

+
 NO_SGLANG_ERROR_MSG = "You are trying to use an sglang model, for which you need `sglang` and `flashinfer`, which are not available in your environment. Please install them using pip, `pip install sglang flashinfer`."

+
 def can_load_extended_tasks() -> bool:
     imports = []
     for package in ["langdetect", "openai"]:

From 235836012db1f3a9677dc460b8da68ff30ba9f0f Mon Sep 17 00:00:00 2001
From: Qiujiang Chen <12012211@mail.sustech.edu.cn>
Date: Sat, 15 Feb 2025 19:33:44 +0800
Subject: [PATCH 09/10] Update use-sglang-as-backend.mdx

---
 docs/source/use-sglang-as-backend.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx
index 2edcaeeb5..39d486f1e 100644
--- a/docs/source/use-sglang-as-backend.mdx
+++ b/docs/source/use-sglang-as-backend.mdx
@@ -13,7 +13,7 @@ lighteval sglang \
 parallelism and tensor parallelism. You can choose the parallelism method by setting it in the `model_args`.
-For example if you have 4 GPUs you can split it across using `tensor_parallelism`: +For example if you have 4 GPUs you can split it across using `tp_size`: ```bash lighteval sglang \ @@ -21,7 +21,7 @@ lighteval sglang \ "leaderboard|truthfulqa:mc|0|0" ``` -Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation: +Or, if your model fits on a single GPU, you can use `dp_size` to speed up the evaluation: ```bash lighteval sglang \ From d8841a9be46ef661a65ade98f563de9ea0989000 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Tue, 18 Feb 2025 02:32:42 +0000 Subject: [PATCH 10/10] Squashed commit of the following: commit 1035aa5e4564669c2b094d977c4a124b2d1fe3d6 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Tue Feb 18 01:31:21 2025 +0000 modify document and fix bug commit be58c7c259ad75ae1a83c8d95ef237e8ecee63aa Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Mon Feb 17 14:35:03 2025 +0000 modify toml commit 86e41c98d7430ce837db9d2b24155c2522c11e3d Merge: 132290b 50f3695 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sun Feb 16 01:30:17 2025 +0000 Merge branch 'main' into sglang commit 132290b571b7470ffec04461ce504eb352b031a3 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 11:08:24 2025 +0000 modify document commit 601a75504be8320703af401eaddce2ceef40f3ca Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 10:22:43 2025 +0000 pass pre commit check and modify document commit 3e1fb8899c073622f76c14b7a12c392cfe426a37 Author: qiujiang chen Date: Sat Feb 15 06:59:12 2025 +0000 optimize input, adjust precision commit 1a590760aa5dd188d2e31e3f6ee316433004614d Author: qiujiang chen Date: Thu Feb 13 19:51:22 2025 +0000 text files commit 9dc62b7f55a190e42cd46016a12358c1f1b78c93 Author: qiujiang chen Date: Wed Feb 12 14:08:21 2025 +0000 modify format --- docs/source/use-sglang-as-backend.mdx | 43 +++++++++++-------- .../model_configs/sglang_model_config.yaml | 13 ++++++ src/lighteval/main_sglang.py | 22 +++------- src/lighteval/models/model_input.py | 1 - src/lighteval/models/sglang/sglang_model.py | 3 +- 5 files changed, 45 insertions(+), 37 deletions(-) create mode 100644 examples/model_configs/sglang_model_config.yaml diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx index 39d486f1e..595e4cfb9 100644 --- a/docs/source/use-sglang-as-backend.mdx +++ b/docs/source/use-sglang-as-backend.mdx @@ -29,23 +29,32 @@ lighteval sglang \ "leaderboard|truthfulqa:mc|0|0" ``` -Available arguments for `sglang` can be found in the `SGLangModelConfig`: - -- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load. -- **load_format** (str): The format the weights are loaded in. Defaults to *.safetensors/*.bin. -- **dtype** (str): Dtype used for the model, defaults to bfloat16. -- **tp_size** (int): The number of GPUs the model weights get sharded over. -- **dp_size** (int): The number of data-parallel copies of the model. -- **context_length** (int | None): The number of tokens our model can process including the input. -- **random_seed** (int): Can be used to enforce more deterministic behavior. -- **trust_remote_code** (bool): If True, will use locally cached config files, otherwise use remote configs in HuggingFace. -- **skip_tokenizer_init** (bool): Set to true to provide the tokens to the engine and get the output tokens directly, typically used in RLHF. -- **kv_cache_dtype** (str): Dtype of the kv cache, defaults to the auto. 
-- **add_special_tokens** (bool): Whether to add special tokens to the input sequences. -- **sampling_backend** (str | None): The backend for sampling. -- **attention_backend** (str | None): The backend for attention computation and KV cache management. -- **mem_fraction_static** (float): Fraction of the free GPU memory used for static memory like model weights and KV cache. -- **chunked_prefill_size** (int): Perform the prefill in chunks of these size. +## Use a config file + +For more advanced configurations, you can use a config file for the model. +An example of a config file is shown below and can be found at `examples/model_configs/sglang_model_config.yaml`. + +```bash +lighteval sglang \ + "examples/model_configs/sglang_model_config.yaml" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +```yaml +model: # Model specific parameters + base_params: + model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,dtype=float16,chunked_prefill_size=4096,mem_fraction_static=0.9" # Model args that you would pass in the command line + generation: # Generation specific parameters + temperature: 0.3 + repetition_penalty: 1.0 + frequency_penalty: 0.0 + presence_penalty: 0.0 + top_k: -1 + min_p: 0.0 + top_p: 0.9 + max_new_tokens: 256 + stop_tokens: ["", ""] +``` > [!WARNING] > In the case of OOM issues, you might need to reduce the context size of the diff --git a/examples/model_configs/sglang_model_config.yaml b/examples/model_configs/sglang_model_config.yaml new file mode 100644 index 000000000..2a980e3a8 --- /dev/null +++ b/examples/model_configs/sglang_model_config.yaml @@ -0,0 +1,13 @@ +model: + base_params: + model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,dtype=float16,chunked_prefill_size=4096,mem_fraction_static=0.9" + generation: + temperature: 0.3 + repetition_penalty: 1.0 + frequency_penalty: 0.0 + presence_penalty: 0.0 + top_k: -1 + min_p: 0.0 + top_p: 0.9 + max_new_tokens: 256 + stop_tokens: ["", ""] diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index a49840abe..b20cde512 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -19,9 +19,7 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-import json import os -import re from typing import Optional from typer import Argument, Option @@ -135,23 +133,13 @@ def sglang( if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] + model_args = config["base_params"]["model_args"] generation_parameters = GenerationParameters.from_dict(config) - model_config = SGLangModelConfig(config, generation_parameters=generation_parameters) - else: - pattern = re.compile(r"(\w+)=(\{.*\}|[^,]+)") - matches = pattern.findall(model_args) - model_args_dict = {} - generation_params = None - for key, value in matches: - key = key.strip() - if key == "generation_parameters": - value = re.sub(r"(\w+):", r'"\1":', value) - value = json.loads(value) - generation_params = GenerationParameters(**value) - else: - model_args_dict[key] = value - model_config = SGLangModelConfig(**model_args_dict, generation_parameters=generation_params) + generation_parameters = GenerationParameters() + + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_config = SGLangModelConfig(**model_args_dict, generation_parameters=generation_parameters) pipeline = Pipeline( tasks=tasks, diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 56b9f379f..ebbf46f22 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -158,7 +158,6 @@ def to_tgi_ie_dict(self) -> dict: def to_sglang_dict(self) -> dict: args = { "max_new_tokens": self.max_new_tokens, - "stop_token_ids": self.stop_tokens, "temperature": self.temperature, "stop": self.stop_tokens, "top_p": self.top_p, diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 9bb8af7bd..08346f325 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -136,7 +136,7 @@ def _create_auto_model(self, config: SGLangModelConfig, env_config: EnvConfig) - "device": "cuda", "random_seed": config.random_seed, "load_format": config.load_format, - "context_length": int(self._max_length) if self._max_length else 8192, + "context_length": self._max_length, "dp_size": int(config.dp_size), "tp_size": int(config.tp_size), "sampling_backend": config.sampling_backend, @@ -146,7 +146,6 @@ def _create_auto_model(self, config: SGLangModelConfig, env_config: EnvConfig) - "chunked_prefill_size": int(config.chunked_prefill_size), "disable_radix_cache": True, } - model = Engine(**self.model_args) if self._max_length is None: