From f39de4654897c5faf3e34c8917b0e79e62e10f37 Mon Sep 17 00:00:00 2001 From: JADDYK <12012211@mail.sustech.edu.cn> Date: Fri, 7 Feb 2025 22:15:18 +0800 Subject: [PATCH 01/10] support sglang --- src/lighteval/__main__.py | 3 + src/lighteval/main_sglang.py | 163 +++++++ src/lighteval/main_vllm.py | 3 + src/lighteval/models/model_input.py | 5 + src/lighteval/models/model_loader.py | 15 +- src/lighteval/models/sglang/sglang_model.py | 456 ++++++++++++++++++++ src/lighteval/models/vllm/vllm_model.py | 5 +- src/lighteval/pipeline.py | 8 +- src/lighteval/tasks/registry.py | 1 + src/lighteval/utils/imports.py | 6 +- 10 files changed, 660 insertions(+), 5 deletions(-) create mode 100644 src/lighteval/main_sglang.py create mode 100644 src/lighteval/models/sglang/sglang_model.py diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index e4053813e..77312f57c 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -31,6 +31,7 @@ import lighteval.main_nanotron import lighteval.main_tasks import lighteval.main_vllm +import lighteval.main_sglang app = typer.Typer() @@ -63,7 +64,9 @@ app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_nanotron.nanotron) +# Jayon02: add vllm cmd app.command(rich_help_panel="Evaluation Backends")(lighteval.main_vllm.vllm) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang) app.add_typer( lighteval.main_endpoint.app, name="endpoint", diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py new file mode 100644 index 000000000..0c662a9dd --- /dev/null +++ b/src/lighteval/main_sglang.py @@ -0,0 +1,163 @@ +# TODO: change to what? +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +import os +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +TOKEN = os.getenv("HF_TOKEN") +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANEL_NAME_1 = "Common Parameters" +HELP_PANEL_NAME_2 = "Logging Parameters" +HELP_PANEL_NAME_3 = "Debug Parameters" +HELP_PANEL_NAME_4 = "Modeling Parameters" + +# TODO: change +def sglang( + # === general === + model_args: Annotated[ + str, + Argument( + help="Model arguments in the form key1=value1,key2=value2,... 
or path to yaml config file (see examples/model_configs/transformers_model.yaml)" + ), + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) + ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3) + ] = None, + job_id: Annotated[ + int, Option(help="Optional job id for future reference.", rich_help_panel=HELP_PANEL_NAME_3) + ] = 0, +): + """ + Evaluate models using vllm as backend. 
+ """ + import yaml + + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_input import GenerationParameters + from lighteval.models.sglang.sglang_model import SGLANGModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + TOKEN = os.getenv("HF_TOKEN") + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + ## Jayon02: vllm pipeline parameter + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.SGLANG, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=-1, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + load_responses_from_details_date_id=load_responses_from_details_date_id, + ) + + ## Jayon02: support two ways to load model + if model_args.endswith(".yaml"): + with open(model_args, "r") as f: + config = yaml.safe_load(f)["model"] + generation_parameters = GenerationParameters.from_dict(config) + model_config = SGLANGModelConfig(config, generation_parameters=generation_parameters) + + else: + ## cmd arg + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + + model_config = SGLANGModelConfig(**model_args_dict) + + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index d063c3fa8..90132dcba 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -116,6 +116,7 @@ def vllm( hub_results_org=results_org, ) + ## Jayon02: vllm pipeline parameter pipeline_params = PipelineParameters( launcher_type=ParallelismManager.VLLM, env_config=env_config, @@ -130,6 +131,7 @@ def vllm( load_responses_from_details_date_id=load_responses_from_details_date_id, ) + ## Jayon02: support two ways to load model if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] @@ -137,6 +139,7 @@ def vllm( model_config = VLLMModelConfig(config, generation_parameters=generation_parameters) else: + ## cmd arg model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_config = VLLMModelConfig(**model_args_dict) diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 04e35be17..687a561a3 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -119,3 +119,8 @@ def to_tgi_ie_dict(self) -> dict: "truncate": self.truncate_prompt, } return {k: v for k, v in args.items() if v is not None} + + # TODO first: sampling parameter + def to_sglang_dict(self) -> dict: + return {k: v for k, v in asdict(self).items() if v is not None} + diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index 68835fda7..3bf76af7d 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ 
-32,6 +32,7 @@ from lighteval.models.endpoints.openai_model import OpenAIClient, OpenAIModelConfig from lighteval.models.endpoints.tgi_model import ModelClient, TGIModelConfig from lighteval.models.litellm_model import LiteLLMClient, LiteLLMModelConfig +from lighteval.models.sglang.sglang_model import SGLANGModelConfig, SGLANGModel from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig @@ -43,7 +44,7 @@ is_litellm_available, is_openai_available, is_tgi_available, - is_vllm_available, + is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, ) from lighteval.utils.utils import EnvConfig @@ -62,6 +63,7 @@ def load_model( # noqa: C901 VLLMModelConfig, OpenAIModelConfig, LiteLLMModelConfig, + SGLANGModelConfig, ], env_config: EnvConfig, ) -> Union[TransformersModel, AdapterModel, DeltaModel, ModelClient, DummyModel]: @@ -96,6 +98,11 @@ def load_model( # noqa: C901 if isinstance(config, VLLMModelConfig): return load_model_with_accelerate_or_default(config=config, env_config=env_config) + if isinstance(config, SGLANGModelConfig): + # TODO: double check + # return load_model_with_accelerate_or_default(config=config, env_config=env_config) + return load_sglang_model(config=config, env_config=env_config) + if isinstance(config, OpenAIModelConfig): return load_openai_model(config=config, env_config=env_config) @@ -159,3 +166,9 @@ def load_model_with_accelerate_or_default( def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig): return DummyModel(config=config, env_config=env_config) + +def load_sglang_model(config: SGLANGModelConfig, env_config: EnvConfig): + if not is_sglang_available(): + raise ImportError(NO_SGLANG_ERROR_MSG) + + return SGLANGModel(config=config, env_config=env_config) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py new file mode 100644 index 000000000..6b81d2a8f --- /dev/null +++ b/src/lighteval/models/sglang/sglang_model.py @@ -0,0 +1,456 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
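# NOTE: this module follows the structure of the existing vLLM backend
# (src/lighteval/models/vllm/vllm_model.py), swapping vLLM's `LLM` engine for an
# offline `sglang.Engine` and vLLM's `SamplingParams` for SGLang sampling
# parameters, while reusing the same request/response types
# (GreedyUntilRequest -> GenerativeResponse, LoglikelihoodRequest -> LoglikelihoodResponse).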
+ +import gc +import itertools +import logging +import os +from dataclasses import dataclass +from typing import Optional + +import torch +from tqdm import tqdm + +from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset +from lighteval.models.abstract_model import LightevalModel, ModelInfo +from lighteval.models.model_input import GenerationParameters +from lighteval.models.model_output import ( + GenerativeResponse, + LoglikelihoodResponse, +) +from lighteval.models.utils import _get_dtype, _simplify_name +from lighteval.tasks.requests import ( + GreedyUntilRequest, + LoglikelihoodRequest, +) +from lighteval.utils.imports import is_sglang_available +from lighteval.utils.utils import EnvConfig, as_list + + +logger = logging.getLogger(__name__) + +from more_itertools import distribute +from sglang import Engine +from sglang.srt.hf_transformers_utils import get_tokenizer +from sglang.lang.ir import SglSamplingParams +from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment + +# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel + +logging.getLogger("sglang").propagate = True +logging.getLogger("sglang").handlers.clear() + +## Jayon02: sglang with what dependency, ray? flashinfer? +# if is_sglang_available(): +# from more_itertools import distribute +# from vllm import LLM, SamplingParams +# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel +# from vllm.transformers_utils.tokenizer import get_tokenizer +# +# logging.getLogger("sglang").propagate = True +# logging.getLogger("sglang").handlers.clear() +# else: +# LLM = None +# SamplingParams = None +# get_tokenizer = None +# distribute = None + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +STARTING_BATCH_SIZE = 512 + +## change to all sglang config +@dataclass +class SGLANGModelConfig: + pretrained: str # + trust_remote_code: bool = True # + dtype: str = "auto" # + tensor_parallel_size: int = 1 # how many GPUs to use for tensor parallelism + device: str = "cuda" + disable_radix_cache: bool = True + seed: int = 42 # + disable_cuda_graph: bool = True + disable_cuda_graph_padding: bool = True + max_model_length: int | None = None # maximum length of the model, ussually infered automatically. reduce this if you encouter OOM issues, 4096 is usually enough + return_token_ids: bool = True + + gpu_memory_utilisation: float = 0.9 # lower this if you are running out of memory + revision: str = "main" # revision of the model + pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism + data_parallel_size: int = 1 # how many GPUs to use for data parallelism + swap_space: int = 4 # CPU swap space size (GiB) per GPU. + use_chat_template: bool = False + add_special_tokens: bool = True + multichoice_continuations_start_space: bool = ( + True # whether to add a space at the start of each continuation in multichoice generation + ) + pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. 
+ generation_parameters: GenerationParameters = None # sampling parameters to use for generation + + subfolder: Optional[str] = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() + + +class SGLANGModel(LightevalModel): + def __init__( + self, + config: SGLANGModelConfig, + env_config: EnvConfig, + ): + """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" + self._config = config + self.use_chat_template = config.use_chat_template + self.data_parallel_size = int(config.data_parallel_size) + self.tensor_parallel_size = int(config.tensor_parallel_size) + + self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False + self._tokenizer = self._create_auto_tokenizer(config, env_config) + + self._max_length = int(config.max_model_length) if config.max_model_length is not None else None + + # If model_parallel is not set we compare the number of processes with the number of GPUs + self.model = self._create_auto_model(config, env_config) + + # self._device = config.accelerator.device if config.accelerator is not None else "cpu" + self.multichoice_continuations_start_space = config.multichoice_continuations_start_space + + self.model_name = _simplify_name(config.pretrained) + self.model_sha = "" # config.get_model_sha() + self.precision = _get_dtype(config.dtype, config=self._config) + + self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) + self.sampling_params = SglSamplingParams(**config.generation_parameters.to_sglang_dict()) + self.pairwise_tokenization = config.pairwise_tokenization + + @property + def tokenizer(self): + return self._tokenizer + + def cleanup(self): + destroy_model_parallel() + if self.model is not None: + del self.model.llm_engine.model_executor.driver_worker + self.model = None + gc.collect() + # TODO: check sglang dependency: ray flashinfer ray? + # ray.shutdown() + destroy_distributed_environment() + torch.cuda.empty_cache() + + @property + def add_special_tokens(self): + return self._add_special_tokens + + @property + def max_length(self) -> int: + return self._max_length + + def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) -> Optional[Engine]: + """ + Creates an instance of the pretrained HF model. + + Args: + pretrained (str): The name or path of the pretrained model. + revision (str): The revision of the model. + subfolder (Optional[str], optional): The subfolder within the model. Defaults to None. + max_memory (Optional[dict], optional): The maximum memory to allocate for the model per GPU. Defaults to None. + device_map (Optional[dict], optional): The device mapping for the model. Defaults to None. + torch_dtype (Optional[Union[str, torch.dtype]], optional): The torch data type for the model. Defaults to None. + quantization_config (Optional[Union[BitsAndBytesConfig, GPTQConfig]], optional): The quantization configuration for the model. Defaults to None. + trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. + cache_dir (str, optional): The cache directory for the model. Defaults to "/scratch". + + Returns: + transformers.PreTrainedModel: The created auto model instance. 
+ """ + # self.model_args = { + # "model": config.pretrained, + # "gpu_memory_utilization": float(config.gpu_memory_utilisation), + # "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), + # "dtype": config.dtype, + # "trust_remote_code": config.trust_remote_code, + # "tensor_parallel_size": int(config.tensor_parallel_size), + # "pipeline_parallel_size": int(config.pipeline_parallel_size), + # "max_model_len": self._max_length, + # "swap_space": 4, + # "seed": 1234, + # } + + # TODO: double check + self.model_args = { + "model_path": config.pretrained, + "trust_remote_code": config.trust_remote_code, + "dtype": config.dtype, + "tp_size": int(config.tensor_parallel_size), + "device": "cuda", + "disable_radix_cache": config.disable_radix_cache, + "random_seed": config.seed, + "disable_cuda_graph": config.disable_cuda_graph, + "disable_cuda_graph_padding": config.disable_cuda_graph_padding, + "context_length": self._max_length, + "log_level": "info", + "return_token_ids": True, + + "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), + } + + # TODO: double check + # if int(config.data_parallel_size) > 1: + # self.model_args["distributed_executor_backend"] = "ray" + # self._batch_size = "auto" + # return None + + model = Engine(**self.model_args) + + # TODO: double check + # If the max_length can't get extracted from the config, it will be inferred from the model + # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model + # config and tk config, like mistralai/Mistral-7B-v0.1 + # if self._max_length is None: + # self._max_length = model.llm_engine.model_config.max_seq_len_to_capture + + return model + + def _create_auto_tokenizer(self, config: SGLANGModelConfig, env_config: EnvConfig): + tokenizer = get_tokenizer( + config.pretrained, + tokenizer_mode="auto", + trust_remote_code=config.trust_remote_code, + tokenizer_revision=config.revision, + ) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + def greedy_until( + self, + requests: list[GreedyUntilRequest], + override_bs: Optional[int] = None, + ) -> list[GenerativeResponse]: + """ + Generates responses using a greedy decoding strategy until certain ending conditions are met. + + Args: + requests (list[Request]): list of requests containing the context and ending conditions. + override_bs (int, optional): Override the batch size for generation. Defaults to None. + + Returns: + list[GenerateReturn]: list of generated responses. + """ + for request in requests: + request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] + request.tokenized_context = self.tok_encode(request.context) + + dataset = GenerativeTaskDataset(requests=requests, num_dataset_splits=self.DATASET_SPLITS) + results = [] + + for _ in tqdm( + dataset.splits_start_end_iterator(), + total=dataset.num_dataset_splits, + desc="Splits", + position=0, + disable=False, # self.disable_tqdm, + ): + # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: + stop_tokens = [] + else: + # NOTE: we are assuming all items in a batch behave similarly (same + # stop_tokens and max_tokens genrated) which is not necessarily + # the case! 
Because of that we only use batch size of 1 + stop_tokens = dataset[0].stop_sequence + + max_new_tokens = dataset[0].generation_size # could be none + returns_logits = dataset[0].use_logits + num_samples = dataset[0].num_samples + + context = [c.context for c in dataset] + tokenized = self.tokenizer(context, add_special_tokens=self.add_special_tokens) + + # The main question for this step is the following: + # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk + # of losing some meaning, or have some generations that are exceedingly short? + # The choice we go for here is to avoid truncating the prompt if we can, since it + # should have been managed by the prompt creator/few shot manager if requested by the user. + inputs = tokenized["input_ids"] + context_size = len(inputs[0]) + + # left truncate the inputs to the maximum length + if max_new_tokens is not None: + if context_size + max_new_tokens > self.max_length: + logger.warning( + f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + ) + context_size = self.max_length - max_new_tokens + inputs = [input[-context_size:] for input in inputs] + else: + if context_size > self.max_length: + logger.warning( + f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." + ) + context_size = self.max_length + inputs = [input[-context_size:] for input in inputs] + + sglang_outputs = self._generate( + inputs=inputs, + max_new_tokens=max_new_tokens, + stop_tokens=stop_tokens, + returns_logits=returns_logits, + num_samples=num_samples, + ) + + for sglang_output in sglang_outputs: + output_token_ids = [outputs.token_ids for outputs in sglang_output.outputs] + logprobs = [output.logprobs for output in sglang_output.outputs] or [] + logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] + result = [output.text for output in sglang_output.outputs] + input_token_ids = sglang_output.prompt_token_ids + + cur_response = GenerativeResponse( + result=result, + logits=logprobs, + generated_tokens=list(output_token_ids), + input_tokens=input_token_ids, + ) + results.append(cur_response) + + return dataset.get_original_order(results) + + def _generate( + self, + inputs: list[list[int]], + max_new_tokens: Optional[int] = None, + stop_tokens: Optional[list[str]] = None, + returns_logits: Optional[bool] = False, + num_samples: int = 1, + generate: bool = True, + ) -> list[GenerativeResponse]: + """Contains the actual logic of the generation.""" + sampling_params = self.sampling_params.clone() or SglSamplingParams() + if generate: + sampling_params.n = num_samples + sampling_params.max_tokens = max_new_tokens + sampling_params.stop = stop_tokens + sampling_params.logprobs = 1 if returns_logits else 0 + + else: + sampling_params.temperature = 0 + sampling_params.prompt_logprobs = 1 + sampling_params.max_tokens = 1 + sampling_params.detokenize = False + + ## Jayon02: how do sglang handle this + # if self.data_parallel_size > 1: + # # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote + # # also seems to only work with decorator and not with ray.remote() fn + # # see https://github.com/vllm-project/vllm/issues/973 + # # note: this has changed on 0.3.3, and it only works now if num_gpus are set. 
+ # # but then tensor_parallel breaks + # # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set, + # # as VLLM complains about no GPUs available. + # @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None) + # def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): + # llm = LLM(**model_args) + # return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params) + # + # # dispatch requests to all self.data_parallel_size workers, in interleaved fashion + # # interleaved important to balance context lengths across workers + # requests = [list(x) for x in distribute(self.data_parallel_size, inputs)] + # inputs = ((self.model_args, sampling_params, req) for req in requests) + # object_refs = [run_inference_one_model.remote(*x) for x in inputs] + # results = ray.get(object_refs) + # # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required. + # ray.shutdown() + # # flatten results + # outputs = [ + # x + # for x in itertools.chain.from_iterable(itertools.zip_longest(*[list(x) for x in results])) + # if x is not None + # ] + # else: + # outputs = self.model.generate( + # prompt_token_ids=inputs, + # sampling_params=sampling_params, + # use_tqdm=True, + # ) + + outputs = self.model.generate( + prompt_token_ids=inputs, + sampling_params=sampling_params, + use_tqdm=True, + ) + + return outputs + + def loglikelihood( + self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None + ) -> list[LoglikelihoodResponse]: + for request in requests: + if request.context == "": + request.tokenized_context = [self.tokenizer.eos_token_id] + request.tokenized_continuation = self.tok_encode(request.choice) + else: + # The following line is mandatory for compatibility with the harness + request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( + request.context, request.choice, pairwise=self.pairwise_tokenization + ) + return self._loglikelihood_tokens(requests, override_bs=override_bs) + + def _loglikelihood_tokens( + self, + requests: list[LoglikelihoodRequest], + override_bs: int = -1, + return_bool_score: bool = True, + rolling: bool = False, + ) -> list[LoglikelihoodResponse]: + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=1) + res = [] + + for _ in tqdm(dataset.splits_start_end_iterator()): + # the last token is an eos token, so we don't need to add it + inputs = [dataset[i].tokenized_context + dataset[i].tokenized_continuation for i in range(len(dataset))] + # Left truncate the inputs to the maximum length + inputs = [input[-self.max_length :] for input in inputs] + outputs = self._generate(inputs, generate=False) + + for output, input in zip(outputs, dataset): + continuation_logprobs = [] + for token, logprobs in zip(input.tokenized_continuation[::-1], output.prompt_logprobs[::-1]): + continuation_logprobs.append(logprobs[token]) + bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs) + continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs] + answer = LoglikelihoodResponse( + input_tokens=input.tokenized_context + input.tokenized_continuation, + generated_tokens=input.tokenized_continuation, + result=(sum(continuation_logprobs), bool_score if return_bool_score else None), + ) + res.append(answer) + + return dataset.get_original_order(res) + + def loglikelihood_rolling(): + pass + + def loglikelihood_single_token(): + pass diff --git a/src/lighteval/models/vllm/vllm_model.py 
b/src/lighteval/models/vllm/vllm_model.py index 0bb13a6f1..9d666b4a5 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) - +## Jayon02: sglang with what dependency, ray? flashinfer? if is_vllm_available(): import ray from more_itertools import distribute @@ -72,7 +72,7 @@ STARTING_BATCH_SIZE = 512 - +## change to all sglang config @dataclass class VLLMModelConfig: pretrained: str @@ -326,6 +326,7 @@ def _generate( sampling_params.max_tokens = 1 sampling_params.detokenize = False + ## Jayon02: how do sglang handle this if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote # also seems to only work with decorator and not with ray.remote() fn diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 0e6282ef5..148c1849e 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -57,7 +57,7 @@ is_nanotron_available, is_openai_available, is_tgi_available, - is_vllm_available, + is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, ) from lighteval.utils.parallelism import test_all_gather from lighteval.utils.utils import EnvConfig, make_results_table @@ -86,6 +86,7 @@ class ParallelismManager(Enum): OPENAI = auto() VLLM = auto() NONE = auto() + SGLANG = auto() @dataclass @@ -113,6 +114,9 @@ def __post_init__(self): # noqa C901 elif self.launcher_type == ParallelismManager.VLLM: if not is_vllm_available(): raise ImportError(NO_VLLM_ERROR_MSG) + elif self.launcher_type == ParallelismManager.SGLANG: + if not is_sglang_available(): + raise ImportError(NO_SGLANG_ERROR_MSG) elif self.launcher_type == ParallelismManager.TGI: if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) @@ -189,7 +193,9 @@ def _init_model(self, model_config, model): env_config=self.pipeline_parameters.env_config, ) else: + ## Jayon02: load model into vllm return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) + if isinstance(model, TransformersModel): return model else: diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 174a98d33..71a8c2bb1 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -299,6 +299,7 @@ def taskinfo_selector(tasks: str, task_registry: Registry) -> tuple[list[str], d expanded_tasks = task_registry.task_groups_dict.get(maybe_task_group, [maybe_task_group]) expanded_tasks_list.extend(expanded_tasks) + ## task expand and few shot number record, not load task for task in expanded_tasks_list: try: suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|")) diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py index 9b92adcee..14e3b94da 100644 --- a/src/lighteval/utils/imports.py +++ b/src/lighteval/utils/imports.py @@ -87,9 +87,13 @@ def is_litellm_available() -> bool: def is_vllm_available() -> bool: return importlib.util.find_spec("vllm") is not None and importlib.util.find_spec("ray") is not None - NO_VLLM_ERROR_MSG = "You are trying to use an VLLM model, for which you need `vllm` and `ray`, which are not available in your environment. Please install them using pip, `pip install vllm ray`." 
+# TODO: need review +def is_sglang_available() -> bool: + return importlib.util.find_spec("sglang") is not None and importlib.util.find_spec("flashinfer") is not None + +NO_SGLANG_ERROR_MSG = "You are trying to use an sglang model, for which you need `sglang` and `flashinfer`, which are not available in your environment. Please install them using pip, `pip install vllm ray`." def can_load_extended_tasks() -> bool: imports = [] From 22dc29b7effbf9876848ecaad96a3a1c983cc0ac Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 10:20:29 +0800 Subject: [PATCH 02/10] output bugs --- sglang_inputs_token.txt | 16 ++++ sglang_output.txt | 11 +++ sglang_sampling_para.txt | 1 + src/lighteval/models/model_input.py | 16 +++- src/lighteval/models/sglang/sglang_model.py | 85 +++++++++++++-------- src/lighteval/models/vllm/vllm_model.py | 1 - vllm_inputs_token.txt | 16 ++++ vllm_output.txt | 1 + vllm_sampling_para.txt | 1 + 9 files changed, 112 insertions(+), 36 deletions(-) create mode 100644 sglang_inputs_token.txt create mode 100644 sglang_output.txt create mode 100644 sglang_sampling_para.txt create mode 100644 vllm_inputs_token.txt create mode 100644 vllm_output.txt create mode 100644 vllm_sampling_para.txt diff --git a/sglang_inputs_token.txt b/sglang_inputs_token.txt new file mode 100644 index 000000000..76bbf4434 --- /dev/null +++ b/sglang_inputs_token.txt @@ -0,0 +1,16 @@ +1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 +1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 +1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 
+1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 +1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 +1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 +1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 +1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 +1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 +1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/sglang_output.txt b/sglang_output.txt new file mode 100644 index 000000000..e546c060d --- /dev/null +++ b/sglang_output.txt @@ -0,0 +1,11 @@ +│ │ │ { │ │ +│ │ │ │ 'text': ' Answer according to: For the given dictionary d with keys │ │ +│ │ being integers, trave'+208, │ │ +│ │ │ │ 'meta_info': { │ │ +│ │ │ │ │ 'id': '04224678f3a24f7abfafef26cd6d101f', │ │ +│ │ │ │ │ 'finish_reason': {'type': 'stop', 'matched': '\n'}, │ │ +│ │ │ │ │ 'prompt_tokens': 77, │ │ +│ │ │ │ │ 'completion_tokens': 58, │ │ +│ │ │ │ │ 'cached_tokens': 0 │ │ +│ │ │ │ } │ │ +│ │ │ }, \ No newline at end of file diff --git a/sglang_sampling_para.txt b/sglang_sampling_para.txt new file mode 100644 index 000000000..e4813cdca --- /dev/null +++ b/sglang_sampling_para.txt @@ -0,0 +1 @@ +{'top_p': 1.0, 'top_k': -1, 'max_new_tokens': 100, 'stop': ['\n', ''], 'temperature': 1.0, 'repetition_penalty': 1.0, 'skip_special_tokens': True, 'spaces_between_special_tokens': True} \ No newline at end of file diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 687a561a3..bedc12b8d 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -120,7 +120,17 @@ def to_tgi_ie_dict(self) -> dict: } return {k: v for k, v in args.items() if v is not None} - # TODO first: sampling parameter - def to_sglang_dict(self) -> dict: - return {k: v for k, v in asdict(self).items() if v is not None} + # # TODO first: sampling parameter + # def to_sglang_dict(self) -> dict: + # args = { 
+ # "max_new_tokens": self.max_new_tokens, + # "min_new_tokens": self.min_new_tokens, + # "stop_token_ids": self.stop_tokens, + # "temperature": self.temperature, + # "top_k": self.top_k, + # "top_p": self.top_p, + # "min_p": self.min_p, + # "repetition_penalty": self.repetition_penalty, + # } + # return {k: v for k, v in args.items() if v is not None} diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 6b81d2a8f..b73d7bf67 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -51,7 +51,7 @@ from more_itertools import distribute from sglang import Engine from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.lang.ir import SglSamplingParams +from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment # from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel @@ -141,7 +141,8 @@ def __init__( self.precision = _get_dtype(config.dtype, config=self._config) self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - self.sampling_params = SglSamplingParams(**config.generation_parameters.to_sglang_dict()) + # self.sampling_params = SamplingParams(**config.generation_parameters.to_sglang_dict()) + self.sampling_params = dict() self.pairwise_tokenization = config.pairwise_tokenization @property @@ -211,7 +212,7 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - "disable_cuda_graph_padding": config.disable_cuda_graph_padding, "context_length": self._max_length, "log_level": "info", - "return_token_ids": True, + # "return_token_ids": True, "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), } @@ -297,20 +298,20 @@ def greedy_until( context_size = len(inputs[0]) # left truncate the inputs to the maximum length - if max_new_tokens is not None: - if context_size + max_new_tokens > self.max_length: - logger.warning( - f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." - ) - context_size = self.max_length - max_new_tokens - inputs = [input[-context_size:] for input in inputs] - else: - if context_size > self.max_length: - logger.warning( - f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." - ) - context_size = self.max_length - inputs = [input[-context_size:] for input in inputs] + # if max_new_tokens is not None: + # if context_size + max_new_tokens > self.max_length: + # logger.warning( + # f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + # ) + # context_size = self.max_length - max_new_tokens + # inputs = [input[-context_size:] for input in inputs] + # else: + # if context_size > self.max_length: + # logger.warning( + # f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." 
+ # ) + # context_size = self.max_length + # inputs = [input[-context_size:] for input in inputs] sglang_outputs = self._generate( inputs=inputs, @@ -347,18 +348,31 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" - sampling_params = self.sampling_params.clone() or SglSamplingParams() - if generate: - sampling_params.n = num_samples - sampling_params.max_tokens = max_new_tokens - sampling_params.stop = stop_tokens - sampling_params.logprobs = 1 if returns_logits else 0 - - else: - sampling_params.temperature = 0 - sampling_params.prompt_logprobs = 1 - sampling_params.max_tokens = 1 - sampling_params.detokenize = False + # TODO: double check without clone + # bug: params are wrong + # sampling_params = self.sampling_params + # if generate: + # sampling_params.n = num_samples + # sampling_params.max_tokens = max_new_tokens + # sampling_params.stop = stop_tokens + # sampling_params.logprobs = 1 if returns_logits else 0 + + # else: + # sampling_params.temperature = 0 + # sampling_params.prompt_logprobs = 1 + # sampling_params.max_tokens = 1 + # sampling_params.detokenize = False + + params = dict( + top_p=1.0, + top_k=-1, + max_new_tokens=max_new_tokens, + stop=stop_tokens, + temperature=1.0, + repetition_penalty=1.0, + skip_special_tokens=True, + spaces_between_special_tokens=True + ) ## Jayon02: how do sglang handle this # if self.data_parallel_size > 1: @@ -395,12 +409,19 @@ def _generate( # use_tqdm=True, # ) + # print(params) + # exit(0) + outputs = self.model.generate( - prompt_token_ids=inputs, - sampling_params=sampling_params, - use_tqdm=True, + input_ids=inputs, + sampling_params=params, ) + # outputs = self.model.generate( + # inputs, + # params, + # ) + return outputs def loglikelihood( diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 9d666b4a5..b86850a6c 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -326,7 +326,6 @@ def _generate( sampling_params.max_tokens = 1 sampling_params.detokenize = False - ## Jayon02: how do sglang handle this if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote # also seems to only work with decorator and not with ray.remote() fn diff --git a/vllm_inputs_token.txt b/vllm_inputs_token.txt new file mode 100644 index 000000000..76bbf4434 --- /dev/null +++ b/vllm_inputs_token.txt @@ -0,0 +1,16 @@ +1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 +1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 
+1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 +1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 +1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 +1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 +1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 +1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 +1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 +1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 +1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 +1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 +1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/vllm_output.txt b/vllm_output.txt new file mode 100644 index 000000000..fab763088 --- /dev/null +++ b/vllm_output.txt @@ -0,0 +1 @@ +RequestOutput(request_id=0, prompt=None, prompt_token_ids=[1, 8789, 13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 
2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='', token_ids=(13,), cumulative_logprob=-0.047923602163791656, logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], finish_reason=stop, stop_reason=)], finished=True, metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), lora_request=None, num_cached_tokens=0) diff --git a/vllm_sampling_para.txt b/vllm_sampling_para.txt new file mode 100644 index 000000000..17aabb410 --- /dev/null +++ b/vllm_sampling_para.txt @@ -0,0 +1 @@ +SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=['\n', ''], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=100, min_tokens=0, logprobs=0, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None) \ No newline at end of file From b23063be4435d6647f45a3f7932a3a096daac187 Mon Sep 17 00:00:00 2001 From: Qiujiang Chen <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 10:26:41 +0800 Subject: [PATCH 03/10] Update vllm_output.txt --- vllm_output.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm_output.txt b/vllm_output.txt index fab763088..d4fc354cb 100644 --- a/vllm_output.txt +++ b/vllm_output.txt @@ -1 +1,12 @@ -RequestOutput(request_id=0, prompt=None, prompt_token_ids=[1, 8789, 13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='', token_ids=(13,), cumulative_logprob=-0.047923602163791656, logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], finish_reason=stop, stop_reason=)], finished=True, metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), lora_request=None, num_cached_tokens=0) +RequestOutput(request_id=0, +prompt=None, +prompt_token_ids=[1, 8789, 
13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], +encoder_prompt=None, +encoder_prompt_token_ids=None, prompt_logprobs=None, +outputs=[CompletionOutput(index=0, text='', token_ids=(13,), + cumulative_logprob=-0.047923602163791656, + logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], + finish_reason=stop, stop_reason=)], +finished=True, +metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), +lora_request=None, num_cached_tokens=0) From 85954f26758badfc11cb4136f18c513862b9e396 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 15:45:50 +0800 Subject: [PATCH 04/10] fix outputs bug --- src/lighteval/models/model_input.py | 1 - src/lighteval/models/sglang/sglang_model.py | 235 ++++---------------- src/lighteval/models/vllm/vllm_model.py | 2 +- 3 files changed, 45 insertions(+), 193 deletions(-) diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index bedc12b8d..575eb8024 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -120,7 +120,6 @@ def to_tgi_ie_dict(self) -> dict: } return {k: v for k, v in args.items() if v is not None} - # # TODO first: sampling parameter # def to_sglang_dict(self) -> dict: # args = { # "max_new_tokens": self.max_new_tokens, diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index b73d7bf67..a45efb9f5 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -82,36 +82,19 @@ @dataclass class SGLANGModelConfig: pretrained: str # - trust_remote_code: bool = True # + load_format: str = "auto" dtype: str = "auto" # - tensor_parallel_size: int = 1 # how many GPUs to use for tensor parallelism + tp_size: int = 1 # how many GPUs to use for tensor parallelism + dp_size: int = 1 # how many GPUs to use for data parallelism + context_length: int | None = None + random_seed: Optional[int] = None + trust_remote_code: bool = True # + chat_template: Optional[str] = None device: str = "cuda" - disable_radix_cache: bool = True - seed: int = 42 # - disable_cuda_graph: bool = True - disable_cuda_graph_padding: bool = True - max_model_length: int | None = None # maximum length of the model, ussually infered automatically. 
reduce this if you encouter OOM issues, 4096 is usually enough - return_token_ids: bool = True - - gpu_memory_utilisation: float = 0.9 # lower this if you are running out of memory - revision: str = "main" # revision of the model + skip_tokenizer_init: bool = False + kv_cache_dtype: str = "auto", + add_special_tokens: bool = True, pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism - data_parallel_size: int = 1 # how many GPUs to use for data parallelism - swap_space: int = 4 # CPU swap space size (GiB) per GPU. - use_chat_template: bool = False - add_special_tokens: bool = True - multichoice_continuations_start_space: bool = ( - True # whether to add a space at the start of each continuation in multichoice generation - ) - pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. - generation_parameters: GenerationParameters = None # sampling parameters to use for generation - - subfolder: Optional[str] = None - - def __post_init__(self): - if not self.generation_parameters: - self.generation_parameters = GenerationParameters() - class SGLANGModel(LightevalModel): def __init__( @@ -121,30 +104,18 @@ def __init__( ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" self._config = config - self.use_chat_template = config.use_chat_template - self.data_parallel_size = int(config.data_parallel_size) - self.tensor_parallel_size = int(config.tensor_parallel_size) - - self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False + self.use_chat_template = config.chat_template is not None + self.data_parallel_size = int(config.dp_size) + self.tensor_parallel_size = int(config.tp_size) + self._add_special_tokens = bool(config.add_special_tokens) self._tokenizer = self._create_auto_tokenizer(config, env_config) - - self._max_length = int(config.max_model_length) if config.max_model_length is not None else None - - # If model_parallel is not set we compare the number of processes with the number of GPUs + self._max_length = int(config.context_length) if config.context_length is not None else 256 self.model = self._create_auto_model(config, env_config) - - # self._device = config.accelerator.device if config.accelerator is not None else "cpu" - self.multichoice_continuations_start_space = config.multichoice_continuations_start_space - self.model_name = _simplify_name(config.pretrained) self.model_sha = "" # config.get_model_sha() self.precision = _get_dtype(config.dtype, config=self._config) - self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - # self.sampling_params = SamplingParams(**config.generation_parameters.to_sglang_dict()) - self.sampling_params = dict() - self.pairwise_tokenization = config.pairwise_tokenization - + @property def tokenizer(self): return self._tokenizer @@ -155,8 +126,6 @@ def cleanup(self): del self.model.llm_engine.model_executor.driver_worker self.model = None gc.collect() - # TODO: check sglang dependency: ray flashinfer ray? - # ray.shutdown() destroy_distributed_environment() torch.cuda.empty_cache() @@ -186,43 +155,20 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - Returns: transformers.PreTrainedModel: The created auto model instance. 
""" - # self.model_args = { - # "model": config.pretrained, - # "gpu_memory_utilization": float(config.gpu_memory_utilisation), - # "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), - # "dtype": config.dtype, - # "trust_remote_code": config.trust_remote_code, - # "tensor_parallel_size": int(config.tensor_parallel_size), - # "pipeline_parallel_size": int(config.pipeline_parallel_size), - # "max_model_len": self._max_length, - # "swap_space": 4, - # "seed": 1234, - # } - # TODO: double check self.model_args = { "model_path": config.pretrained, "trust_remote_code": config.trust_remote_code, "dtype": config.dtype, - "tp_size": int(config.tensor_parallel_size), "device": "cuda", - "disable_radix_cache": config.disable_radix_cache, - "random_seed": config.seed, - "disable_cuda_graph": config.disable_cuda_graph, - "disable_cuda_graph_padding": config.disable_cuda_graph_padding, - "context_length": self._max_length, + "random_seed": config.random_seed, + "load_format": config.load_format, + "context_length": int(self._max_length) if self._max_length else None, + "dp_size": int(config.dp_size), + "tp_size": int(config.tp_size), "log_level": "info", - # "return_token_ids": True, - - "revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""), } - # TODO: double check - # if int(config.data_parallel_size) > 1: - # self.model_args["distributed_executor_backend"] = "ray" - # self._batch_size = "auto" - # return None - model = Engine(**self.model_args) # TODO: double check @@ -239,7 +185,7 @@ def _create_auto_tokenizer(self, config: SGLANGModelConfig, env_config: EnvConfi config.pretrained, tokenizer_mode="auto", trust_remote_code=config.trust_remote_code, - tokenizer_revision=config.revision, + tokenizer_revision="main", ) tokenizer.pad_token = tokenizer.eos_token return tokenizer @@ -294,34 +240,36 @@ def greedy_until( # of losing some meaning, or have some generations that are exceedingly short? # The choice we go for here is to avoid truncating the prompt if we can, since it # should have been managed by the prompt creator/few shot manager if requested by the user. + inputs = tokenized["input_ids"] context_size = len(inputs[0]) # left truncate the inputs to the maximum length - # if max_new_tokens is not None: - # if context_size + max_new_tokens > self.max_length: - # logger.warning( - # f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." - # ) - # context_size = self.max_length - max_new_tokens - # inputs = [input[-context_size:] for input in inputs] - # else: - # if context_size > self.max_length: - # logger.warning( - # f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." - # ) - # context_size = self.max_length - # inputs = [input[-context_size:] for input in inputs] + if max_new_tokens is not None: + if context_size + max_new_tokens > self.max_length: + logger.warning( + f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." + ) + context_size = self.max_length - max_new_tokens + inputs = [input[-context_size:] for input in inputs] + else: + if context_size > self.max_length: + logger.warning( + f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." 
+ ) + context_size = self.max_length + inputs = [input[-context_size:] for input in inputs] sglang_outputs = self._generate( inputs=inputs, max_new_tokens=max_new_tokens, stop_tokens=stop_tokens, - returns_logits=returns_logits, num_samples=num_samples, ) for sglang_output in sglang_outputs: + print(sglang_output) + exit(0) output_token_ids = [outputs.token_ids for outputs in sglang_output.outputs] logprobs = [output.logprobs for output in sglang_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] @@ -343,132 +291,37 @@ def _generate( inputs: list[list[int]], max_new_tokens: Optional[int] = None, stop_tokens: Optional[list[str]] = None, - returns_logits: Optional[bool] = False, num_samples: int = 1, generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" # TODO: double check without clone - # bug: params are wrong # sampling_params = self.sampling_params - # if generate: - # sampling_params.n = num_samples - # sampling_params.max_tokens = max_new_tokens - # sampling_params.stop = stop_tokens - # sampling_params.logprobs = 1 if returns_logits else 0 - - # else: - # sampling_params.temperature = 0 - # sampling_params.prompt_logprobs = 1 - # sampling_params.max_tokens = 1 - # sampling_params.detokenize = False - + params = dict( top_p=1.0, top_k=-1, + min_p=0, max_new_tokens=max_new_tokens, - stop=stop_tokens, + # stop=stop_tokens, temperature=1.0, repetition_penalty=1.0, skip_special_tokens=True, spaces_between_special_tokens=True ) - ## Jayon02: how do sglang handle this - # if self.data_parallel_size > 1: - # # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote - # # also seems to only work with decorator and not with ray.remote() fn - # # see https://github.com/vllm-project/vllm/issues/973 - # # note: this has changed on 0.3.3, and it only works now if num_gpus are set. - # # but then tensor_parallel breaks - # # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set, - # # as VLLM complains about no GPUs available. - # @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None) - # def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests): - # llm = LLM(**model_args) - # return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params) - # - # # dispatch requests to all self.data_parallel_size workers, in interleaved fashion - # # interleaved important to balance context lengths across workers - # requests = [list(x) for x in distribute(self.data_parallel_size, inputs)] - # inputs = ((self.model_args, sampling_params, req) for req in requests) - # object_refs = [run_inference_one_model.remote(*x) for x in inputs] - # results = ray.get(object_refs) - # # Invoke ray.shutdown() to prevent hang-ups if subsequent calls required. 
- # ray.shutdown() - # # flatten results - # outputs = [ - # x - # for x in itertools.chain.from_iterable(itertools.zip_longest(*[list(x) for x in results])) - # if x is not None - # ] - # else: - # outputs = self.model.generate( - # prompt_token_ids=inputs, - # sampling_params=sampling_params, - # use_tqdm=True, - # ) - - # print(params) - # exit(0) - outputs = self.model.generate( input_ids=inputs, sampling_params=params, + return_logprob=True, ) - # outputs = self.model.generate( - # inputs, - # params, - # ) - return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodResponse]: - for request in requests: - if request.context == "": - request.tokenized_context = [self.tokenizer.eos_token_id] - request.tokenized_continuation = self.tok_encode(request.choice) - else: - # The following line is mandatory for compatibility with the harness - request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( - request.context, request.choice, pairwise=self.pairwise_tokenization - ) - return self._loglikelihood_tokens(requests, override_bs=override_bs) - - def _loglikelihood_tokens( - self, - requests: list[LoglikelihoodRequest], - override_bs: int = -1, - return_bool_score: bool = True, - rolling: bool = False, - ) -> list[LoglikelihoodResponse]: - dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=1) - res = [] - - for _ in tqdm(dataset.splits_start_end_iterator()): - # the last token is an eos token, so we don't need to add it - inputs = [dataset[i].tokenized_context + dataset[i].tokenized_continuation for i in range(len(dataset))] - # Left truncate the inputs to the maximum length - inputs = [input[-self.max_length :] for input in inputs] - outputs = self._generate(inputs, generate=False) - - for output, input in zip(outputs, dataset): - continuation_logprobs = [] - for token, logprobs in zip(input.tokenized_continuation[::-1], output.prompt_logprobs[::-1]): - continuation_logprobs.append(logprobs[token]) - bool_score = all(logprob.rank == 1 for logprob in continuation_logprobs) - continuation_logprobs = [logprob.logprob for logprob in continuation_logprobs] - answer = LoglikelihoodResponse( - input_tokens=input.tokenized_context + input.tokenized_continuation, - generated_tokens=input.tokenized_continuation, - result=(sum(continuation_logprobs), bool_score if return_bool_score else None), - ) - res.append(answer) - - return dataset.get_original_order(res) + pass def loglikelihood_rolling(): pass diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index b86850a6c..cd493a980 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -48,7 +48,6 @@ logger = logging.getLogger(__name__) -## Jayon02: sglang with what dependency, ray? flashinfer? 
if is_vllm_available(): import ray from more_itertools import distribute @@ -287,6 +286,7 @@ def greedy_until( ) for vllm_output in vllm_outputs: + output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] From 985d366a790f26322161712f6f8fc1b1ba1cf222 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 8 Feb 2025 23:24:24 +0800 Subject: [PATCH 05/10] fix outputs bug --- src/lighteval/models/sglang/sglang_model.py | 37 ++++++++++++--------- src/lighteval/models/vllm/vllm_model.py | 1 - 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index a45efb9f5..0a64cafcd 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -54,8 +54,6 @@ from sglang.srt.sampling.sampling_params import SamplingParams from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment -# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel - logging.getLogger("sglang").propagate = True logging.getLogger("sglang").handlers.clear() @@ -123,7 +121,7 @@ def tokenizer(self): def cleanup(self): destroy_model_parallel() if self.model is not None: - del self.model.llm_engine.model_executor.driver_worker + self.model.shutdown() self.model = None gc.collect() destroy_distributed_environment() @@ -169,6 +167,9 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - "log_level": "info", } + if config.dp_size > 1: + pass + model = Engine(**self.model_args) # TODO: double check @@ -267,15 +268,17 @@ def greedy_until( num_samples=num_samples, ) - for sglang_output in sglang_outputs: - print(sglang_output) - exit(0) - output_token_ids = [outputs.token_ids for outputs in sglang_output.outputs] - logprobs = [output.logprobs for output in sglang_output.outputs] or [] - logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] - result = [output.text for output in sglang_output.outputs] - input_token_ids = sglang_output.prompt_token_ids - + for i in range(len(sglang_outputs)): + sglang_output = sglang_outputs[i] + # print(sglang_output) + # exit(0) + meta_info = sglang_output["meta_info"] + output_token_logprobs = meta_info["output_token_logprobs"] + output_token_ids = [output[1] for output in output_token_logprobs] + logprobs = [output[0] for output in output_token_logprobs] + result = [sglang_output["text"]] + input_token_ids = inputs[i] + cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -296,20 +299,24 @@ def _generate( ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" # TODO: double check without clone - # sampling_params = self.sampling_params params = dict( top_p=1.0, top_k=-1, min_p=0, max_new_tokens=max_new_tokens, - # stop=stop_tokens, + stop=stop_tokens, temperature=1.0, repetition_penalty=1.0, skip_special_tokens=True, - spaces_between_special_tokens=True + spaces_between_special_tokens=True, + n=num_samples ) + if not generate: + params.temperature = 0 + params.max_tokens = 1 + outputs = self.model.generate( input_ids=inputs, sampling_params=params, diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 
cd493a980..a052b97ba 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -286,7 +286,6 @@ def greedy_until( ) for vllm_output in vllm_outputs: - output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] From c395b1e4a61f74ca838f7f782f950b26d8a819f8 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Mon, 10 Feb 2025 06:08:20 -0800 Subject: [PATCH 06/10] adjust precision --- sglang_inputs_token.txt | 16 -- sglang_output.txt | 11 -- sglang_sampling_para.txt | 1 - src/lighteval/__main__.py | 1 - src/lighteval/main_sglang.py | 5 - src/lighteval/main_vllm.py | 2 - src/lighteval/models/model_input.py | 19 +-- src/lighteval/models/sglang/sglang_model.py | 171 ++++++++++---------- src/lighteval/models/vllm/vllm_model.py | 12 +- src/lighteval/pipeline.py | 5 +- vllm_inputs_token.txt | 16 -- vllm_output.txt | 12 -- vllm_sampling_para.txt | 1 - 13 files changed, 97 insertions(+), 175 deletions(-) delete mode 100644 sglang_inputs_token.txt delete mode 100644 sglang_output.txt delete mode 100644 sglang_sampling_para.txt delete mode 100644 vllm_inputs_token.txt delete mode 100644 vllm_output.txt delete mode 100644 vllm_sampling_para.txt diff --git a/sglang_inputs_token.txt b/sglang_inputs_token.txt deleted file mode 100644 index 76bbf4434..000000000 --- a/sglang_inputs_token.txt +++ /dev/null @@ -1,16 +0,0 @@ -1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 -1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 -1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 
-1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 -1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 -1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 -1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 -1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 -1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 -1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 -1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/sglang_output.txt b/sglang_output.txt deleted file mode 100644 index e546c060d..000000000 --- a/sglang_output.txt +++ /dev/null @@ -1,11 +0,0 @@ -│ │ │ { │ │ -│ │ │ │ 'text': ' Answer according to: For the given dictionary d with keys │ │ -│ │ being integers, trave'+208, │ │ -│ │ │ │ 'meta_info': { │ │ -│ │ │ │ │ 'id': '04224678f3a24f7abfafef26cd6d101f', │ │ -│ │ │ │ │ 'finish_reason': {'type': 'stop', 'matched': '\n'}, │ │ -│ │ │ │ │ 'prompt_tokens': 77, │ │ -│ │ │ │ │ 'completion_tokens': 58, │ │ -│ │ │ │ │ 'cached_tokens': 0 │ │ -│ │ │ │ } │ │ -│ │ │ }, \ No newline at end of file diff --git a/sglang_sampling_para.txt b/sglang_sampling_para.txt deleted file mode 100644 index e4813cdca..000000000 --- a/sglang_sampling_para.txt +++ /dev/null @@ -1 +0,0 @@ -{'top_p': 1.0, 'top_k': -1, 'max_new_tokens': 100, 'stop': ['\n', ''], 'temperature': 1.0, 'repetition_penalty': 1.0, 'skip_special_tokens': True, 'spaces_between_special_tokens': True} \ No newline at end of file diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 77312f57c..1e79188dd 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -64,7 +64,6 @@ app.command(rich_help_panel="Evaluation 
Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_nanotron.nanotron) -# Jayon02: add vllm cmd app.command(rich_help_panel="Evaluation Backends")(lighteval.main_vllm.vllm) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_sglang.sglang) app.add_typer( diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index 0c662a9dd..539867105 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -1,4 +1,3 @@ -# TODO: change to what? # MIT License # Copyright (c) 2024 The HuggingFace Team @@ -35,7 +34,6 @@ HELP_PANEL_NAME_3 = "Debug Parameters" HELP_PANEL_NAME_4 = "Modeling Parameters" -# TODO: change def sglang( # === general === model_args: Annotated[ @@ -117,7 +115,6 @@ def sglang( hub_results_org=results_org, ) - ## Jayon02: vllm pipeline parameter pipeline_params = PipelineParameters( launcher_type=ParallelismManager.SGLANG, env_config=env_config, @@ -132,7 +129,6 @@ def sglang( load_responses_from_details_date_id=load_responses_from_details_date_id, ) - ## Jayon02: support two ways to load model if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] @@ -144,7 +140,6 @@ def sglang( model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_config = SGLANGModelConfig(**model_args_dict) - pipeline = Pipeline( tasks=tasks, pipeline_parameters=pipeline_params, diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 90132dcba..130b5ff56 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -116,7 +116,6 @@ def vllm( hub_results_org=results_org, ) - ## Jayon02: vllm pipeline parameter pipeline_params = PipelineParameters( launcher_type=ParallelismManager.VLLM, env_config=env_config, @@ -131,7 +130,6 @@ def vllm( load_responses_from_details_date_id=load_responses_from_details_date_id, ) - ## Jayon02: support two ways to load model if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 575eb8024..c91d9fbc9 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -120,16 +120,11 @@ def to_tgi_ie_dict(self) -> dict: } return {k: v for k, v in args.items() if v is not None} - # def to_sglang_dict(self) -> dict: - # args = { - # "max_new_tokens": self.max_new_tokens, - # "min_new_tokens": self.min_new_tokens, - # "stop_token_ids": self.stop_tokens, - # "temperature": self.temperature, - # "top_k": self.top_k, - # "top_p": self.top_p, - # "min_p": self.min_p, - # "repetition_penalty": self.repetition_penalty, - # } - # return {k: v for k, v in args.items() if v is not None} + def to_sglang_dict(self) -> dict: + args = { + "max_new_tokens": self.max_new_tokens, + "stop_token_ids": self.stop_tokens, + "temperature": self.temperature, + } + return {k: v for k, v in args.items() if v is not None} diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 0a64cafcd..7bf3b999c 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -21,9 +21,10 @@ # SOFTWARE. 
import gc -import itertools import logging import os +import subprocess +import signal from dataclasses import dataclass from typing import Optional @@ -45,55 +46,47 @@ from lighteval.utils.imports import is_sglang_available from lighteval.utils.utils import EnvConfig, as_list - logger = logging.getLogger(__name__) -from more_itertools import distribute -from sglang import Engine -from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.sampling.sampling_params import SamplingParams -from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment - -logging.getLogger("sglang").propagate = True -logging.getLogger("sglang").handlers.clear() - -## Jayon02: sglang with what dependency, ray? flashinfer? -# if is_sglang_available(): -# from more_itertools import distribute -# from vllm import LLM, SamplingParams -# from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel -# from vllm.transformers_utils.tokenizer import get_tokenizer -# -# logging.getLogger("sglang").propagate = True -# logging.getLogger("sglang").handlers.clear() -# else: -# LLM = None -# SamplingParams = None -# get_tokenizer = None -# distribute = None +if is_sglang_available(): + from sglang import Engine + from sglang.srt.hf_transformers_utils import get_tokenizer + from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment + + logging.getLogger("sglang").propagate = True + logging.getLogger("sglang").handlers.clear() +else: + Engine = None + get_tokenizer = None os.environ["TOKENIZERS_PARALLELISM"] = "false" STARTING_BATCH_SIZE = 512 -## change to all sglang config @dataclass class SGLANGModelConfig: - pretrained: str # + pretrained: str load_format: str = "auto" - dtype: str = "auto" # + dtype: str = "auto" tp_size: int = 1 # how many GPUs to use for tensor parallelism dp_size: int = 1 # how many GPUs to use for data parallelism context_length: int | None = None - random_seed: Optional[int] = None - trust_remote_code: bool = True # - chat_template: Optional[str] = None + random_seed: Optional[int] = 1234 + trust_remote_code: bool = False + chat_template: Optional[str] = None # no use + use_chat_template: bool = False device: str = "cuda" skip_tokenizer_init: bool = False - kv_cache_dtype: str = "auto", - add_special_tokens: bool = True, + kv_cache_dtype: str = "auto" + add_special_tokens: bool = True pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism + generation_parameters: GenerationParameters = None + + def __post_init__(self): + if not self.generation_parameters: + self.generation_parameters = GenerationParameters() + class SGLANGModel(LightevalModel): def __init__( self, @@ -102,26 +95,57 @@ def __init__( ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" self._config = config - self.use_chat_template = config.chat_template is not None + self.use_chat_template = config.use_chat_template self.data_parallel_size = int(config.dp_size) self.tensor_parallel_size = int(config.tp_size) self._add_special_tokens = bool(config.add_special_tokens) self._tokenizer = self._create_auto_tokenizer(config, env_config) - self._max_length = int(config.context_length) if config.context_length is not None else 256 + self._max_length = int(config.context_length) if config.context_length is not None else None self.model = self._create_auto_model(config, env_config) self.model_name = _simplify_name(config.pretrained) self.model_sha = 
"" # config.get_model_sha() self.precision = _get_dtype(config.dtype, config=self._config) + self.sampling_params = config.generation_parameters.to_sglang_dict() self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - + @property def tokenizer(self): return self._tokenizer def cleanup(self): + + def reap_children(signum, frame): + try: + while True: + pid, status = os.waitpid(-1, os.WNOHANG) + if pid == 0: + break + print(f"Reaped child process {pid} with status {status}") + except ChildProcessError: + pass + + signal.signal(signal.SIGCHLD, reap_children) + + destroy_model_parallel() if self.model is not None: self.model.shutdown() + result = subprocess.run(["nvidia-smi", "--query-compute-apps=pid,process_name,gpu_uuid", + "--format=csv,noheader,nounits"], capture_output=True, text=True) + lines = result.stdout.strip().split("\n") + target_pids = [] + + for line in lines: + parts = [p.strip() for p in line.split(",")] + if len(parts) < 2: + continue + pid, process_name = parts[:2] + if process_name == "sglang::scheduler": + target_pids.append(pid) + + for pid in target_pids: + os.kill(int(pid), 9) + self.model = None gc.collect() destroy_distributed_environment() @@ -136,23 +160,7 @@ def max_length(self) -> int: return self._max_length def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) -> Optional[Engine]: - """ - Creates an instance of the pretrained HF model. - Args: - pretrained (str): The name or path of the pretrained model. - revision (str): The revision of the model. - subfolder (Optional[str], optional): The subfolder within the model. Defaults to None. - max_memory (Optional[dict], optional): The maximum memory to allocate for the model per GPU. Defaults to None. - device_map (Optional[dict], optional): The device mapping for the model. Defaults to None. - torch_dtype (Optional[Union[str, torch.dtype]], optional): The torch data type for the model. Defaults to None. - quantization_config (Optional[Union[BitsAndBytesConfig, GPTQConfig]], optional): The quantization configuration for the model. Defaults to None. - trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False. - cache_dir (str, optional): The cache directory for the model. Defaults to "/scratch". - - Returns: - transformers.PreTrainedModel: The created auto model instance. - """ # TODO: double check self.model_args = { "model_path": config.pretrained, @@ -172,12 +180,8 @@ def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) - model = Engine(**self.model_args) - # TODO: double check - # If the max_length can't get extracted from the config, it will be inferred from the model - # Inferring from the tokenizer will cause vllm to bug for models with mismatches between model - # config and tk config, like mistralai/Mistral-7B-v0.1 - # if self._max_length is None: - # self._max_length = model.llm_engine.model_config.max_seq_len_to_capture + if self._max_length is None: + self._max_length = 8192 return model @@ -218,15 +222,12 @@ def greedy_until( total=dataset.num_dataset_splits, desc="Splits", position=0, - disable=False, # self.disable_tqdm, + disable=False, ): - # For chat models, generation stops with EOS token, so we don't need to specify stop tokens + if self.use_chat_template: stop_tokens = [] else: - # NOTE: we are assuming all items in a batch behave similarly (same - # stop_tokens and max_tokens genrated) which is not necessarily - # the case! 
Because of that we only use batch size of 1 stop_tokens = dataset[0].stop_sequence max_new_tokens = dataset[0].generation_size # could be none @@ -267,17 +268,13 @@ def greedy_until( stop_tokens=stop_tokens, num_samples=num_samples, ) - - for i in range(len(sglang_outputs)): - sglang_output = sglang_outputs[i] - # print(sglang_output) - # exit(0) + + for input_token_ids, sglang_output in zip(inputs, sglang_outputs): meta_info = sglang_output["meta_info"] output_token_logprobs = meta_info["output_token_logprobs"] output_token_ids = [output[1] for output in output_token_logprobs] logprobs = [output[0] for output in output_token_logprobs] result = [sglang_output["text"]] - input_token_ids = inputs[i] cur_response = GenerativeResponse( result=result, @@ -298,31 +295,27 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" - # TODO: double check without clone - - params = dict( - top_p=1.0, - top_k=-1, - min_p=0, - max_new_tokens=max_new_tokens, - stop=stop_tokens, - temperature=1.0, - repetition_penalty=1.0, - skip_special_tokens=True, - spaces_between_special_tokens=True, - n=num_samples - ) - - if not generate: - params.temperature = 0 - params.max_tokens = 1 + # TODO: double check + + self.sampling_params["stop"] = stop_tokens + self.sampling_params["n"] = num_samples + self.sampling_params["top_p"] = 1.0 + self.sampling_params["top_k"] = -1 + self.sampling_params["skip_special_tokens"] = True + + if generate: + self.sampling_params["temperature"] = 0.6 + self.sampling_params["max_new_tokens"] = max_new_tokens + else: + self.sampling_params["temperature"] = 0 + self.sampling_params["max_new_tokens"] = 1 outputs = self.model.generate( input_ids=inputs, - sampling_params=params, + sampling_params=self.sampling_params, return_logprob=True, ) - + return outputs def loglikelihood( diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index a052b97ba..051b088fe 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -71,7 +71,6 @@ STARTING_BATCH_SIZE = 512 -## change to all sglang config @dataclass class VLLMModelConfig: pretrained: str @@ -130,7 +129,7 @@ def __init__( self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) self.sampling_params = SamplingParams(**config.generation_parameters.to_vllm_openai_dict()) self.pairwise_tokenization = config.pairwise_tokenization - + @property def tokenizer(self): return self._tokenizer @@ -276,7 +275,7 @@ def greedy_until( ) context_size = self.max_length inputs = [input[-context_size:] for input in inputs] - + vllm_outputs = self._generate( inputs=inputs, max_new_tokens=max_new_tokens, @@ -284,14 +283,14 @@ def greedy_until( returns_logits=returns_logits, num_samples=num_samples, ) - + for vllm_output in vllm_outputs: output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] result = [output.text for output in vllm_output.outputs] input_token_ids = vllm_output.prompt_token_ids - + cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -358,12 +357,13 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r sampling_params=sampling_params, use_tqdm=True, ) - + return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: 
Optional[int] = None ) -> list[LoglikelihoodResponse]: + for request in requests: if request.context == "": request.tokenized_context = [self.tokenizer.eos_token_id] diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 148c1849e..d71d6991a 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -151,7 +151,6 @@ def __init__( self.evaluation_tracker = evaluation_tracker self.accelerator, self.parallel_context = self._init_parallelism_manager() self.model = self._init_model(model_config, model) - self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info) self._init_tasks_and_requests(tasks=tasks) self._init_random_seeds() @@ -193,7 +192,6 @@ def _init_model(self, model_config, model): env_config=self.pipeline_parameters.env_config, ) else: - ## Jayon02: load model into vllm return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) if isinstance(model, TransformersModel): @@ -213,10 +211,10 @@ def _init_tasks_and_requests(self, tasks: str): cache_dir=self.pipeline_parameters.env_config.cache_dir, custom_tasks=self.pipeline_parameters.custom_tasks_directory, ) + task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) task_dict = registry.get_task_dict(task_names_list) LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes) - self.evaluation_tracker.task_config_logger.log(task_dict) requests, docs = create_requests_from_tasks( @@ -451,6 +449,7 @@ def _run_model(self): responses = run_model(requests, override_bs=self.pipeline_parameters.override_batch_size) # Storing the responses associated to the same samples together + for response, request in zip(responses, requests): for metric_category in request.metric_categories: sample_id = SampleUid(request.task_name, request.sample_index) diff --git a/vllm_inputs_token.txt b/vllm_inputs_token.txt deleted file mode 100644 index 76bbf4434..000000000 --- a/vllm_inputs_token.txt +++ /dev/null @@ -1,16 +0,0 @@ -1,8789,13,1889,1984,15503,2472,5888,13,12,1270,1848,1783,28732,944,28725,2195,1148,1329,13,12,12,944,28723,8554,1148,327,2195,1148,13,1889,1984,3540,2472,28732,5183,15503,2472,1329,13,12,1270,1848,1783,6743,944,28725,1083,1148,1329,13,12,12,944,28723,1666,1148,327,1083,1148,13,28744,327,1984,15503,2472,28732,28782,28731,13,28724,327,1984,3540,2472,28732,28744,28723,8554,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1666,1148,438,272,948,302,456,2007,28804 -1,8789,13,28715,327,371,28740,28747,464,21558,647,28705,28750,28747,464,17664,647,28705,28770,28747,464,12684,3970,14491,13,1042,28730,327,9842,13,1392,613,297,2819,28732,2004,28732,28715,24770,13,12,335,613,1239,28705,28750,859,28705,28740,28747,13,12,12,1042,28730,2679,281,28792,28710,28793,13,13940,28832,13,3195,349,272,1192,302,1117,28730,1024,456,2007,4546,274,28804 -1,8789,13,1270,408,28732,28711,1329,13,12,335,307,859,28705,28740,28747,13,12,12,807,28705,28740,13,12,2013,28747,13,12,12,807,307,648,408,28732,28711,28733,28740,28731,13,28764,327,408,28732,28740,28734,28731,13,28724,327,408,6422,28740,28734,28731,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,337,28723,1148,438,272,948,302,456,2007,28804 
-1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,349,272,1192,302,1318,28723,1148,438,272,948,302,456,2007,28804 -1,8789,13,1270,1369,28732,28744,1329,13,12,1392,613,297,2819,28732,28750,28725,1318,348,28732,28734,28723,28782,24770,13,12,12,335,1318,1239,716,28732,28710,28731,859,28705,28734,28747,13,12,12,12,807,8250,13,12,807,6110,13,13940,28832,13,3195,349,1369,28732,28740,28734,28740,11840 -1,8789,13,1270,285,28740,5888,13,12,807,1117,28732,28744,28731,648,464,21558,28742,13,1270,285,28750,5888,13,12,807,285,28740,28732,28750,28736,28744,28731,13,28744,327,285,28750,28732,28782,28750,28781,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,1270,285,28740,28732,28744,1329,13,12,28744,28792,28734,28793,2679,28705,28740,13,28724,327,285,28740,28732,28740,28731,13,28764,327,285,28750,5187,28740,2803,13,28712,327,285,28740,857,28775,1481,13,13940,28832,13,3195,1212,302,5851,1235,456,2007,7072,28804 -1,8789,13,1889,1984,2472,5888,13,12,1270,1848,1783,6743,944,28725,2095,1329,13,12,12,944,28723,1148,327,2095,13,28744,327,1984,2472,28732,28782,28731,13,28724,327,1984,2472,28732,28744,28723,1148,28731,13,13940,28832,13,3195,1212,302,1928,349,1318,28804 -1,8789,13,1801,327,28705,28734,13,1392,613,297,2819,28732,28740,28734,28734,28725,28705,28734,28725,387,28750,1329,13,12,1801,2679,613,13,13940,28832,13,3195,349,272,1192,302,1024,272,4008,727,1407,28705,28770,349,15731,28804 -1,8789,13,1270,18328,28732,28744,1329,13,12,807,1318,28736,28744,13,28724,327,18328,28732,28782,28731,13,28744,327,18328,28732,28724,28731,13,13940,28832,13,3195,349,272,1192,302,337,438,272,948,302,456,2007,28804 -1,8789,13,28744,327,28705,28782,13,28724,327,28705,28770,13,28764,327,28705,28787,13,28744,327,337,648,1318,13,13940,28832,13,3195,349,272,1192,302,1318,438,1407,28705,28750,28804 -1,8789,13,28744,327,28705,28740,13,335,1318,876,28705,28770,28747,13,12,28724,327,464,21558,28742,13,13940,28832,13,3195,349,272,1192,302,337,438,1407,28705,28781,28804 -1,8789,13,7841,25637,390,7494,13,28744,327,7494,28723,3506,385,28732,28740,28734,28731,13,13940,28832,13,3195,349,272,1192,302,1318,438,272,948,302,456,2007,28804 -1,8789,13,28724,327,1274,28732,2022,28732,501,28725,5936,28740,647,464,21558,647,464,28750,16433,13,13940,28832,13,3195,2118,1235,456,2007,7072,28804 -1,8789,13,335,1318,523,28705,28782,28747,13,12,4119,13,13940,28832,13,3195,2118,1235,456,2007,5439,28804 diff --git a/vllm_output.txt b/vllm_output.txt deleted file mode 100644 index d4fc354cb..000000000 --- a/vllm_output.txt +++ /dev/null @@ -1,12 +0,0 @@ -RequestOutput(request_id=0, -prompt=None, -prompt_token_ids=[1, 8789, 13, 1889, 1984, 15503, 2472, 5888, 13, 12, 1270, 1848, 1783, 28732, 944, 28725, 2195, 1148, 1329, 13, 12, 12, 944, 28723, 8554, 1148, 327, 2195, 1148, 13, 1889, 1984, 3540, 2472, 28732, 5183, 15503, 2472, 1329, 13, 12, 1270, 1848, 1783, 6743, 944, 28725, 1083, 1148, 1329, 13, 12, 12, 944, 28723, 1666, 1148, 327, 1083, 1148, 13, 28744, 327, 1984, 15503, 2472, 28732, 28782, 28731, 13, 28724, 327, 1984, 3540, 2472, 28732, 28744, 28723, 8554, 1148, 28731, 13, 13940, 28832, 13, 3195, 349, 272, 1192, 302, 337, 28723, 1666, 1148, 438, 272, 948, 302, 456, 2007, 28804], -encoder_prompt=None, -encoder_prompt_token_ids=None, prompt_logprobs=None, -outputs=[CompletionOutput(index=0, text='', token_ids=(13,), - 
cumulative_logprob=-0.047923602163791656, - logprobs=[{13: Logprob(logprob=-0.047923602163791656, rank=1, decoded_token='\n')}], - finish_reason=stop, stop_reason=)], -finished=True, -metrics=RequestMetrics(arrival_time=1738946941.3884528, last_token_time=1738946941.3884528, first_scheduled_time=1738946941.3913035, first_token_time=1738946941.6071706, time_in_queue=0.002850770950317383, finished_time=1738946941.608071, scheduler_time=0.001762479543685913, model_forward_time=None, model_execute_time=None), -lora_request=None, num_cached_tokens=0) diff --git a/vllm_sampling_para.txt b/vllm_sampling_para.txt deleted file mode 100644 index 17aabb410..000000000 --- a/vllm_sampling_para.txt +++ /dev/null @@ -1 +0,0 @@ -SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=['\n', ''], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=100, min_tokens=0, logprobs=0, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None) \ No newline at end of file From 9c0f7cb0c7625824e93ce70505d73ae69e0927ac Mon Sep 17 00:00:00 2001 From: qiujiang chen Date: Wed, 12 Feb 2025 05:26:05 +0000 Subject: [PATCH 07/10] adjust precision --- src/lighteval/models/sglang/sglang_model.py | 39 ++------------------- src/lighteval/models/vllm/vllm_model.py | 3 -- 2 files changed, 2 insertions(+), 40 deletions(-) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 7bf3b999c..52cb2ec29 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -73,7 +73,6 @@ class SGLANGModelConfig: context_length: int | None = None random_seed: Optional[int] = 1234 trust_remote_code: bool = False - chat_template: Optional[str] = None # no use use_chat_template: bool = False device: str = "cuda" skip_tokenizer_init: bool = False @@ -112,39 +111,10 @@ def __init__( def tokenizer(self): return self._tokenizer - def cleanup(self): - - def reap_children(signum, frame): - try: - while True: - pid, status = os.waitpid(-1, os.WNOHANG) - if pid == 0: - break - print(f"Reaped child process {pid} with status {status}") - except ChildProcessError: - pass - - signal.signal(signal.SIGCHLD, reap_children) - - + def cleanup(self): destroy_model_parallel() if self.model is not None: self.model.shutdown() - result = subprocess.run(["nvidia-smi", "--query-compute-apps=pid,process_name,gpu_uuid", - "--format=csv,noheader,nounits"], capture_output=True, text=True) - lines = result.stdout.strip().split("\n") - target_pids = [] - - for line in lines: - parts = [p.strip() for p in line.split(",")] - if len(parts) < 2: - continue - pid, process_name = parts[:2] - if process_name == "sglang::scheduler": - target_pids.append(pid) - - for pid in target_pids: - os.kill(int(pid), 9) self.model = None gc.collect() @@ -275,7 +245,6 @@ def greedy_until( output_token_ids = [output[1] for output in output_token_logprobs] logprobs = [output[0] for output in output_token_logprobs] result = [sglang_output["text"]] - cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -296,26 +265,22 @@ def _generate( ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" # TODO: double check - self.sampling_params["stop"] = stop_tokens self.sampling_params["n"] = num_samples self.sampling_params["top_p"] = 1.0 
self.sampling_params["top_k"] = -1 self.sampling_params["skip_special_tokens"] = True + self.sampling_params["temperature"] = 0 if generate: - self.sampling_params["temperature"] = 0.6 self.sampling_params["max_new_tokens"] = max_new_tokens else: - self.sampling_params["temperature"] = 0 self.sampling_params["max_new_tokens"] = 1 - outputs = self.model.generate( input_ids=inputs, sampling_params=self.sampling_params, return_logprob=True, ) - return outputs def loglikelihood( diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 051b088fe..247e477d1 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -290,7 +290,6 @@ def greedy_until( logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] result = [output.text for output in vllm_output.outputs] input_token_ids = vllm_output.prompt_token_ids - cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -357,13 +356,11 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r sampling_params=sampling_params, use_tqdm=True, ) - return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodResponse]: - for request in requests: if request.context == "": request.tokenized_context = [self.tokenizer.eos_token_id] From b733c86686b1d1f15ad4f210fb2b36d232d70bf4 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat, 15 Feb 2025 11:23:39 +0000 Subject: [PATCH 08/10] Squashed commit of the following: commit 132290b571b7470ffec04461ce504eb352b031a3 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 11:08:24 2025 +0000 modify document commit 601a75504be8320703af401eaddce2ceef40f3ca Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 10:22:43 2025 +0000 pass pre commit check and modify document commit 3e1fb8899c073622f76c14b7a12c392cfe426a37 Author: qiujiang chen Date: Sat Feb 15 06:59:12 2025 +0000 optimize input, adjust precision commit 1a590760aa5dd188d2e31e3f6ee316433004614d Author: qiujiang chen Date: Thu Feb 13 19:51:22 2025 +0000 text files commit 9dc62b7f55a190e42cd46016a12358c1f1b78c93 Author: qiujiang chen Date: Wed Feb 12 14:08:21 2025 +0000 modify format --- docs/source/_toctree.yml | 2 + docs/source/installation.mdx | 3 + docs/source/use-sglang-as-backend.mdx | 52 ++++++++ src/lighteval/__main__.py | 2 +- src/lighteval/main_sglang.py | 25 +++- src/lighteval/main_vllm.py | 1 - src/lighteval/models/model_input.py | 29 ++-- src/lighteval/models/model_loader.py | 17 +-- src/lighteval/models/sglang/sglang_model.py | 138 +++++++++++++------- src/lighteval/models/vllm/vllm_model.py | 10 +- src/lighteval/pipeline.py | 9 +- src/lighteval/tasks/registry.py | 1 - src/lighteval/utils/imports.py | 5 +- 13 files changed, 211 insertions(+), 83 deletions(-) create mode 100644 docs/source/use-sglang-as-backend.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 9ad55466a..2a56f1eb5 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,6 +17,8 @@ title: Add a custom metric - local: use-vllm-as-backend title: Use VLLM as backend + - local: use-sglang-as-backend + title: Use SGLang as backend - local: evaluate-the-model-on-a-server-or-container title: Evaluate on Server - local: contributing-to-multilingual-evaluations diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 542c09752..a57bcf6b7 
100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -23,6 +23,8 @@ Lighteval has optional dependencies that you can install by specifying the appropriate extras group. `pip install lighteval[]` or `pip install -e .[]`. +If you want to use lighteval with `sglang`, follow the [sglang installation documentation](https://docs.sglang.ai/start/install.html). + | extra name | description | |--------------|---------------------------------------------------------------------------| | tgi | To use Text Generation Inference API to evaluate your model | @@ -31,6 +33,7 @@ appropriate extras group. | adapters | To evaluate adapters models (delta and peft) | | tensorboardX | To upload your results to tensorboard | | vllm | To use vllm as backend for inference | +| sglang | To use sglang as backend for inference | | s3 | To upload results to s3 | diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx new file mode 100644 index 000000000..2edcaeeb5 --- /dev/null +++ b/docs/source/use-sglang-as-backend.mdx @@ -0,0 +1,52 @@ +# Use SGLang as backend + +Lighteval allows you to use `sglang` as a backend, which provides great speedups. +To use it, simply change the `model_args` to reflect the arguments you want to pass to sglang. + +```bash +lighteval sglang \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +`sglang` is able to distribute the model across multiple GPUs using data +parallelism and tensor parallelism. +You can choose the parallelism method by setting it in the `model_args`. + +For example, if you have 4 GPUs, you can split the model across them using tensor parallelism (`tp_size`): + +```bash +lighteval sglang \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +Or, if your model fits on a single GPU, you can use data parallelism (`dp_size`) to speed up the evaluation: + +```bash +lighteval sglang \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +Available arguments for `sglang` can be found in the `SGLangModelConfig` (a short configuration sketch follows this list): + +- **pretrained** (str): HuggingFace Hub model ID or the path to a pre-trained model to load. +- **load_format** (str): The format the weights are loaded in. Defaults to *.safetensors/*.bin. +- **dtype** (str): Dtype used for the model, defaults to bfloat16. +- **tp_size** (int): The number of GPUs the model weights get sharded over. +- **dp_size** (int): The number of data-parallel copies of the model. +- **context_length** (int | None): The number of tokens the model can process, including the input. +- **random_seed** (int): Can be used to enforce more deterministic behavior. +- **trust_remote_code** (bool): Whether to allow custom model code from the Hub to be executed when loading the model and tokenizer. +- **skip_tokenizer_init** (bool): Set to True to provide the tokens to the engine and get the output tokens directly, typically used in RLHF. +- **kv_cache_dtype** (str): Dtype of the KV cache, defaults to auto. +- **add_special_tokens** (bool): Whether to add special tokens to the input sequences. +- **sampling_backend** (str | None): The backend for sampling. +- **attention_backend** (str | None): The backend for attention computation and KV cache management. +- **mem_fraction_static** (float): Fraction of the free GPU memory used for static memory like model weights and KV cache. +- **chunked_prefill_size** (int): Perform the prefill in chunks of this size.
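The arguments above can also be assembled programmatically. Below is a minimal, editorial sketch (not part of the patch itself) of how they map onto the `SGLangModelConfig` dataclass introduced in this patch series; the model name and every value shown are illustrative assumptions rather than defaults.

```python
from lighteval.models.sglang.sglang_model import SGLangModelConfig

# Illustrative values only; the field names follow the SGLangModelConfig
# dataclass added by this patch series.
config = SGLangModelConfig(
    pretrained="HuggingFaceH4/zephyr-7b-beta",  # HF Hub ID or local path
    dtype="float16",
    tp_size=1,                  # tensor-parallel shards
    dp_size=1,                  # data-parallel replicas
    context_length=4096,        # prompt + generation budget
    mem_fraction_static=0.8,    # lower this if you run out of GPU memory
    chunked_prefill_size=2048,  # smaller chunks also reduce peak memory
)
```

The same key=value pairs can be passed on the command line through `model_args`, as in the bash examples above.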
+ +> [!WARNING] +> In the case of OOM issues, you might need to reduce the context size of the +> model as well as reduce the `mem_fraction_static` and `chunked_prefill_size` parameter. diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index 1e79188dd..eece09f4c 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -29,9 +29,9 @@ import lighteval.main_baseline import lighteval.main_endpoint import lighteval.main_nanotron +import lighteval.main_sglang import lighteval.main_tasks import lighteval.main_vllm -import lighteval.main_sglang app = typer.Typer() diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index 539867105..a49840abe 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -1,6 +1,6 @@ # MIT License -# Copyright (c) 2024 The HuggingFace Team +# Copyright (c) 2024 The SGLang Team # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -19,7 +19,9 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import json import os +import re from typing import Optional from typer import Argument, Option @@ -34,6 +36,7 @@ HELP_PANEL_NAME_3 = "Debug Parameters" HELP_PANEL_NAME_4 = "Modeling Parameters" + def sglang( # === general === model_args: Annotated[ @@ -99,7 +102,7 @@ def sglang( from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.models.model_input import GenerationParameters - from lighteval.models.sglang.sglang_model import SGLANGModelConfig + from lighteval.models.sglang.sglang_model import SGLangModelConfig from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters TOKEN = os.getenv("HF_TOKEN") @@ -133,13 +136,23 @@ def sglang( with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] generation_parameters = GenerationParameters.from_dict(config) - model_config = SGLANGModelConfig(config, generation_parameters=generation_parameters) + model_config = SGLangModelConfig(config, generation_parameters=generation_parameters) else: - ## cmd arg - model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + pattern = re.compile(r"(\w+)=(\{.*\}|[^,]+)") + matches = pattern.findall(model_args) + model_args_dict = {} + generation_params = None + for key, value in matches: + key = key.strip() + if key == "generation_parameters": + value = re.sub(r"(\w+):", r'"\1":', value) + value = json.loads(value) + generation_params = GenerationParameters(**value) + else: + model_args_dict[key] = value + model_config = SGLangModelConfig(**model_args_dict, generation_parameters=generation_params) - model_config = SGLANGModelConfig(**model_args_dict) pipeline = Pipeline( tasks=tasks, pipeline_parameters=pipeline_params, diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 130b5ff56..d063c3fa8 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -137,7 +137,6 @@ def vllm( model_config = VLLMModelConfig(config, generation_parameters=generation_parameters) else: - ## cmd arg model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} model_config = VLLMModelConfig(**model_args_dict) diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 
c91d9fbc9..21e8496a4 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -27,20 +27,20 @@ @dataclass class GenerationParameters: early_stopping: Optional[bool] = None # vllm, transformers - repetition_penalty: Optional[float] = None # vllm, transformers, tgi - frequency_penalty: Optional[float] = None # vllm, tgi + repetition_penalty: Optional[float] = None # vllm, transformers, tgi, sglang + frequency_penalty: Optional[float] = None # vllm, tgi, sglang length_penalty: Optional[float] = None # vllm, transformers - presence_penalty: Optional[float] = None # vllm + presence_penalty: Optional[float] = None # vllm, sglang - max_new_tokens: Optional[int] = None # vllm, transformers, tgi - min_new_tokens: Optional[int] = None # vllm, transformers + max_new_tokens: Optional[int] = None # vllm, transformers, tgi, sglang + min_new_tokens: Optional[int] = None # vllm, transformers, sglang seed: Optional[int] = None # vllm, tgi - stop_tokens: Optional[list[str]] = None # vllm, transformers, tgi - temperature: Optional[float] = None # vllm, transformers, tgi - top_k: Optional[int] = None # vllm, transformers, tgi - min_p: Optional[float] = None # vllm, transformers - top_p: Optional[int] = None # vllm, transformers, tgi + stop_tokens: Optional[list[str]] = None # vllm, transformers, tgi, sglang + temperature: Optional[float] = None # vllm, transformers, tgi, sglang + top_k: Optional[int] = None # vllm, transformers, tgi, sglang + min_p: Optional[float] = None # vllm, transformers, sglang + top_p: Optional[int] = None # vllm, transformers, tgi, sglang truncate_prompt: Optional[bool] = None # vllm, tgi @classmethod @@ -125,6 +125,13 @@ def to_sglang_dict(self) -> dict: "max_new_tokens": self.max_new_tokens, "stop_token_ids": self.stop_tokens, "temperature": self.temperature, + "stop": self.stop_tokens, + "top_p": self.top_p, + "top_k": self.top_k, + "min_p": self.min_p, + "frequency_penalty": self.frequency_penalty, + "presence_penalty": self.presence_penalty, + "repetition_penalty": self.repetition_penalty, + "min_new_tokens": self.min_new_tokens, } return {k: v for k, v in args.items() if v is not None} - diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index 3bf76af7d..ebcd9d3bb 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -32,19 +32,21 @@ from lighteval.models.endpoints.openai_model import OpenAIClient, OpenAIModelConfig from lighteval.models.endpoints.tgi_model import ModelClient, TGIModelConfig from lighteval.models.litellm_model import LiteLLMClient, LiteLLMModelConfig -from lighteval.models.sglang.sglang_model import SGLANGModelConfig, SGLANGModel +from lighteval.models.sglang.sglang_model import SGLangModel, SGLangModelConfig from lighteval.models.transformers.adapter_model import AdapterModel, AdapterModelConfig from lighteval.models.transformers.delta_model import DeltaModel, DeltaModelConfig from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig from lighteval.models.vllm.vllm_model import VLLMModel, VLLMModelConfig from lighteval.utils.imports import ( NO_LITELLM_ERROR_MSG, + NO_SGLANG_ERROR_MSG, NO_TGI_ERROR_MSG, NO_VLLM_ERROR_MSG, is_litellm_available, is_openai_available, + is_sglang_available, is_tgi_available, - is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, + is_vllm_available, ) from lighteval.utils.utils import EnvConfig @@ -63,7 +65,7 @@ def load_model( # noqa: C901 VLLMModelConfig, 
OpenAIModelConfig, LiteLLMModelConfig, - SGLANGModelConfig, + SGLangModelConfig, ], env_config: EnvConfig, ) -> Union[TransformersModel, AdapterModel, DeltaModel, ModelClient, DummyModel]: @@ -98,9 +100,7 @@ def load_model( # noqa: C901 if isinstance(config, VLLMModelConfig): return load_model_with_accelerate_or_default(config=config, env_config=env_config) - if isinstance(config, SGLANGModelConfig): - # TODO: double check - # return load_model_with_accelerate_or_default(config=config, env_config=env_config) + if isinstance(config, SGLangModelConfig): return load_sglang_model(config=config, env_config=env_config) if isinstance(config, OpenAIModelConfig): @@ -167,8 +167,9 @@ def load_model_with_accelerate_or_default( def load_dummy_model(config: DummyModelConfig, env_config: EnvConfig): return DummyModel(config=config, env_config=env_config) -def load_sglang_model(config: SGLANGModelConfig, env_config: EnvConfig): + +def load_sglang_model(config: SGLangModelConfig, env_config: EnvConfig): if not is_sglang_available(): raise ImportError(NO_SGLANG_ERROR_MSG) - return SGLANGModel(config=config, env_config=env_config) + return SGLangModel(config=config, env_config=env_config) diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 52cb2ec29..9bb8af7bd 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -1,6 +1,6 @@ # MIT License -# Copyright (c) 2024 The HuggingFace Team +# Copyright (c) 2024 The SGLang Team # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -22,9 +22,6 @@ import gc import logging -import os -import subprocess -import signal from dataclasses import dataclass from typing import Optional @@ -46,25 +43,22 @@ from lighteval.utils.imports import is_sglang_available from lighteval.utils.utils import EnvConfig, as_list + logger = logging.getLogger(__name__) if is_sglang_available(): from sglang import Engine from sglang.srt.hf_transformers_utils import get_tokenizer - from sglang.srt.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment - + logging.getLogger("sglang").propagate = True logging.getLogger("sglang").handlers.clear() else: Engine = None get_tokenizer = None -os.environ["TOKENIZERS_PARALLELISM"] = "false" - -STARTING_BATCH_SIZE = 512 @dataclass -class SGLANGModelConfig: +class SGLangModelConfig: pretrained: str load_format: str = "auto" dtype: str = "auto" @@ -78,18 +72,22 @@ class SGLANGModelConfig: skip_tokenizer_init: bool = False kv_cache_dtype: str = "auto" add_special_tokens: bool = True - pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism - + pairwise_tokenization: bool = False + sampling_backend: str | None = None + attention_backend: str = None + mem_fraction_static: float = 0.8 + chunked_prefill_size: int = 4096 generation_parameters: GenerationParameters = None - + def __post_init__(self): if not self.generation_parameters: self.generation_parameters = GenerationParameters() -class SGLANGModel(LightevalModel): + +class SGLangModel(LightevalModel): def __init__( self, - config: SGLANGModelConfig, + config: SGLangModelConfig, env_config: EnvConfig, ): """Initializes a HuggingFace `AutoModel` and `AutoTokenizer` for evaluation.""" @@ -106,19 +104,20 @@ def __init__( self.precision = _get_dtype(config.dtype, config=self._config) self.sampling_params = 
config.generation_parameters.to_sglang_dict() self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) - + self.sampling_backend = config.sampling_backend + self.attention_backend = config.attention_backend + self.pairwise_tokenization = config.pairwise_tokenization + @property def tokenizer(self): return self._tokenizer - def cleanup(self): - destroy_model_parallel() + def cleanup(self): if self.model is not None: self.model.shutdown() self.model = None gc.collect() - destroy_distributed_environment() torch.cuda.empty_cache() @property @@ -129,33 +128,33 @@ def add_special_tokens(self): def max_length(self) -> int: return self._max_length - def _create_auto_model(self, config: SGLANGModelConfig, env_config: EnvConfig) -> Optional[Engine]: - - # TODO: double check - self.model_args = { + def _create_auto_model(self, config: SGLangModelConfig, env_config: EnvConfig) -> Optional[Engine]: + self.model_args = { "model_path": config.pretrained, "trust_remote_code": config.trust_remote_code, "dtype": config.dtype, "device": "cuda", "random_seed": config.random_seed, "load_format": config.load_format, - "context_length": int(self._max_length) if self._max_length else None, + "context_length": int(self._max_length) if self._max_length else 8192, "dp_size": int(config.dp_size), "tp_size": int(config.tp_size), - "log_level": "info", + "sampling_backend": config.sampling_backend, + "attention_backend": config.attention_backend, + "mem_fraction_static": float(config.mem_fraction_static), + "schedule_policy": "fcfs", + "chunked_prefill_size": int(config.chunked_prefill_size), + "disable_radix_cache": True, } - if config.dp_size > 1: - pass - model = Engine(**self.model_args) if self._max_length is None: - self._max_length = 8192 + self._max_length = 8192 return model - def _create_auto_tokenizer(self, config: SGLANGModelConfig, env_config: EnvConfig): + def _create_auto_tokenizer(self, config: SGLangModelConfig, env_config: EnvConfig): tokenizer = get_tokenizer( config.pretrained, tokenizer_mode="auto", @@ -194,14 +193,12 @@ def greedy_until( position=0, disable=False, ): - if self.use_chat_template: stop_tokens = [] else: stop_tokens = dataset[0].stop_sequence max_new_tokens = dataset[0].generation_size # could be none - returns_logits = dataset[0].use_logits num_samples = dataset[0].num_samples context = [c.context for c in dataset] @@ -212,7 +209,7 @@ def greedy_until( # of losing some meaning, or have some generations that are exceedingly short? # The choice we go for here is to avoid truncating the prompt if we can, since it # should have been managed by the prompt creator/few shot manager if requested by the user. 
- + inputs = tokenized["input_ids"] context_size = len(inputs[0]) @@ -238,7 +235,7 @@ def greedy_until( stop_tokens=stop_tokens, num_samples=num_samples, ) - + for input_token_ids, sglang_output in zip(inputs, sglang_outputs): meta_info = sglang_output["meta_info"] output_token_logprobs = meta_info["output_token_logprobs"] @@ -252,7 +249,6 @@ def greedy_until( input_tokens=input_token_ids, ) results.append(cur_response) - return dataset.get_original_order(results) def _generate( @@ -264,29 +260,77 @@ def _generate( generate: bool = True, ) -> list[GenerativeResponse]: """Contains the actual logic of the generation.""" - # TODO: double check - self.sampling_params["stop"] = stop_tokens - self.sampling_params["n"] = num_samples - self.sampling_params["top_p"] = 1.0 - self.sampling_params["top_k"] = -1 - self.sampling_params["skip_special_tokens"] = True - self.sampling_params["temperature"] = 0 + logprob_start_len = None + top_logprobs_num = None if generate: self.sampling_params["max_new_tokens"] = max_new_tokens + self.sampling_params["stop"] = stop_tokens + self.sampling_params["n"] = num_samples else: self.sampling_params["max_new_tokens"] = 1 + self.sampling_params["temperature"] = 0 + logprob_start_len = 0 + top_logprobs_num = 1 + outputs = self.model.generate( - input_ids=inputs, - sampling_params=self.sampling_params, - return_logprob=True, - ) + input_ids=inputs, + sampling_params=self.sampling_params, + return_logprob=True, + logprob_start_len=logprob_start_len, + top_logprobs_num=top_logprobs_num, + ) return outputs def loglikelihood( self, requests: list[LoglikelihoodRequest], override_bs: Optional[int] = None ) -> list[LoglikelihoodResponse]: - pass + for request in requests: + if request.context == "": + request.tokenized_context = [self.tokenizer.eos_token_id] + request.tokenized_continuation = self.tok_encode(request.choice) + else: + # The following line is mandatory for compatibility with the harness + request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( + request.context, request.choice, pairwise=self.pairwise_tokenization + ) + + return self._loglikelihood_tokens(requests, override_bs=override_bs) + + def _loglikelihood_tokens( + self, + requests: list[LoglikelihoodRequest], + override_bs: int = -1, + return_bool_score: bool = True, + rolling: bool = False, + ) -> list[LoglikelihoodResponse]: + dataset = LoglikelihoodDataset(requests=requests, num_dataset_splits=1) + res = [] + + for _ in tqdm(dataset.splits_start_end_iterator(), disable=False): + # the last token is an eos token, so we don't need to add it + inputs = [dataset[i].tokenized_context + dataset[i].tokenized_continuation for i in range(len(dataset))] + # Left truncate the inputs to the maximum length + inputs = [input[-self.max_length :] for input in inputs] + outputs = self._generate(inputs, generate=False) + + for output, input in zip(outputs, dataset): + continuation_logprobs = [] + meta_info = output["meta_info"] + input_token_logprobs = meta_info["input_token_logprobs"][::-1] + input_top_logprobs = meta_info["input_top_logprobs"][::-1] + input_top_logprobs = input_top_logprobs[: len(input.tokenized_continuation)] + continuation_logprobs.append(input_token_logprobs[: len(input.tokenized_continuation)]) + bool_score = all( + top[0][1] == input[1] for top, input in zip(input_top_logprobs, continuation_logprobs[0]) + ) + answer = LoglikelihoodResponse( + input_tokens=input.tokenized_context + input.tokenized_continuation, + generated_tokens=input.tokenized_continuation, + 
result=(sum(item[0] for item in continuation_logprobs[0]), bool_score), + ) + res.append(answer) + return dataset.get_original_order(res) def loglikelihood_rolling(): pass diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py index 247e477d1..0bb13a6f1 100644 --- a/src/lighteval/models/vllm/vllm_model.py +++ b/src/lighteval/models/vllm/vllm_model.py @@ -48,6 +48,7 @@ logger = logging.getLogger(__name__) + if is_vllm_available(): import ray from more_itertools import distribute @@ -71,6 +72,7 @@ STARTING_BATCH_SIZE = 512 + @dataclass class VLLMModelConfig: pretrained: str @@ -129,7 +131,7 @@ def __init__( self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) self.sampling_params = SamplingParams(**config.generation_parameters.to_vllm_openai_dict()) self.pairwise_tokenization = config.pairwise_tokenization - + @property def tokenizer(self): return self._tokenizer @@ -275,7 +277,7 @@ def greedy_until( ) context_size = self.max_length inputs = [input[-context_size:] for input in inputs] - + vllm_outputs = self._generate( inputs=inputs, max_new_tokens=max_new_tokens, @@ -283,13 +285,14 @@ def greedy_until( returns_logits=returns_logits, num_samples=num_samples, ) - + for vllm_output in vllm_outputs: output_token_ids = [outputs.token_ids for outputs in vllm_output.outputs] logprobs = [output.logprobs for output in vllm_output.outputs] or [] logprobs = [logprob[token_id].logprob for token_id, logprob in zip(output_token_ids[0], logprobs[0])] result = [output.text for output in vllm_output.outputs] input_token_ids = vllm_output.prompt_token_ids + cur_response = GenerativeResponse( result=result, logits=logprobs, @@ -356,6 +359,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r sampling_params=sampling_params, use_tqdm=True, ) + return outputs def loglikelihood( diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index d71d6991a..649bdcf8b 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -51,13 +51,15 @@ NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, NO_OPENAI_ERROR_MSG, + NO_SGLANG_ERROR_MSG, NO_TGI_ERROR_MSG, NO_VLLM_ERROR_MSG, is_accelerate_available, is_nanotron_available, is_openai_available, + is_sglang_available, is_tgi_available, - is_vllm_available, is_sglang_available, NO_SGLANG_ERROR_MSG, + is_vllm_available, ) from lighteval.utils.parallelism import test_all_gather from lighteval.utils.utils import EnvConfig, make_results_table @@ -151,6 +153,7 @@ def __init__( self.evaluation_tracker = evaluation_tracker self.accelerator, self.parallel_context = self._init_parallelism_manager() self.model = self._init_model(model_config, model) + self.evaluation_tracker.general_config_logger.log_model_info(self.model.model_info) self._init_tasks_and_requests(tasks=tasks) self._init_random_seeds() @@ -193,7 +196,6 @@ def _init_model(self, model_config, model): ) else: return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) - if isinstance(model, TransformersModel): return model else: @@ -211,10 +213,10 @@ def _init_tasks_and_requests(self, tasks: str): cache_dir=self.pipeline_parameters.env_config.cache_dir, custom_tasks=self.pipeline_parameters.custom_tasks_directory, ) - task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) task_dict = registry.get_task_dict(task_names_list) LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes) + 
self.evaluation_tracker.task_config_logger.log(task_dict)

         requests, docs = create_requests_from_tasks(
@@ -449,7 +451,6 @@ def _run_model(self):
             responses = run_model(requests, override_bs=self.pipeline_parameters.override_batch_size)

             # Storing the responses associated to the same samples together
-
             for response, request in zip(responses, requests):
                 for metric_category in request.metric_categories:
                     sample_id = SampleUid(request.task_name, request.sample_index)
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 71a8c2bb1..174a98d33 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -299,7 +299,6 @@ def taskinfo_selector(tasks: str, task_registry: Registry) -> tuple[list[str], d
         expanded_tasks = task_registry.task_groups_dict.get(maybe_task_group, [maybe_task_group])
         expanded_tasks_list.extend(expanded_tasks)

-    ## task expand and few shot number record, not load task
     for task in expanded_tasks_list:
         try:
             suite_name, task_name, few_shot, truncate_few_shots = tuple(task.split("|"))
diff --git a/src/lighteval/utils/imports.py b/src/lighteval/utils/imports.py
index 14e3b94da..5a007c95f 100644
--- a/src/lighteval/utils/imports.py
+++ b/src/lighteval/utils/imports.py
@@ -87,14 +87,17 @@ def is_litellm_available() -> bool:
 def is_vllm_available() -> bool:
     return importlib.util.find_spec("vllm") is not None and importlib.util.find_spec("ray") is not None

+
 NO_VLLM_ERROR_MSG = "You are trying to use an VLLM model, for which you need `vllm` and `ray`, which are not available in your environment. Please install them using pip, `pip install vllm ray`."

-# TODO: need review
+
 def is_sglang_available() -> bool:
     return importlib.util.find_spec("sglang") is not None and importlib.util.find_spec("flashinfer") is not None

+
 NO_SGLANG_ERROR_MSG = "You are trying to use an sglang model, for which you need `sglang` and `flashinfer`, which are not available in your environment. Please install them using pip, `pip install sglang flashinfer`."

+
 def can_load_extended_tasks() -> bool:
     imports = []
     for package in ["langdetect", "openai"]:

From 235836012db1f3a9677dc460b8da68ff30ba9f0f Mon Sep 17 00:00:00 2001
From: Qiujiang Chen <12012211@mail.sustech.edu.cn>
Date: Sat, 15 Feb 2025 19:33:44 +0800
Subject: [PATCH 09/10] Update use-sglang-as-backend.mdx

---
 docs/source/use-sglang-as-backend.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx
index 2edcaeeb5..39d486f1e 100644
--- a/docs/source/use-sglang-as-backend.mdx
+++ b/docs/source/use-sglang-as-backend.mdx
@@ -13,7 +13,7 @@ lighteval sglang \
 parallelism and tensor parallelism. You can choose the parallelism method by setting it in the `model_args`.
-For example if you have 4 GPUs you can split it across using `tensor_parallelism`: +For example if you have 4 GPUs you can split it across using `tp_size`: ```bash lighteval sglang \ @@ -21,7 +21,7 @@ lighteval sglang \ "leaderboard|truthfulqa:mc|0|0" ``` -Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation: +Or, if your model fits on a single GPU, you can use `dp_size` to speed up the evaluation: ```bash lighteval sglang \ From d8841a9be46ef661a65ade98f563de9ea0989000 Mon Sep 17 00:00:00 2001 From: Jayon02 <12012211@mail.sustech.edu.cn> Date: Tue, 18 Feb 2025 02:32:42 +0000 Subject: [PATCH 10/10] Squashed commit of the following: commit 1035aa5e4564669c2b094d977c4a124b2d1fe3d6 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Tue Feb 18 01:31:21 2025 +0000 modify document and fix bug commit be58c7c259ad75ae1a83c8d95ef237e8ecee63aa Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Mon Feb 17 14:35:03 2025 +0000 modify toml commit 86e41c98d7430ce837db9d2b24155c2522c11e3d Merge: 132290b 50f3695 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sun Feb 16 01:30:17 2025 +0000 Merge branch 'main' into sglang commit 132290b571b7470ffec04461ce504eb352b031a3 Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 11:08:24 2025 +0000 modify document commit 601a75504be8320703af401eaddce2ceef40f3ca Author: Jayon02 <12012211@mail.sustech.edu.cn> Date: Sat Feb 15 10:22:43 2025 +0000 pass pre commit check and modify document commit 3e1fb8899c073622f76c14b7a12c392cfe426a37 Author: qiujiang chen Date: Sat Feb 15 06:59:12 2025 +0000 optimize input, adjust precision commit 1a590760aa5dd188d2e31e3f6ee316433004614d Author: qiujiang chen Date: Thu Feb 13 19:51:22 2025 +0000 text files commit 9dc62b7f55a190e42cd46016a12358c1f1b78c93 Author: qiujiang chen Date: Wed Feb 12 14:08:21 2025 +0000 modify format --- docs/source/use-sglang-as-backend.mdx | 43 +++++++++++-------- .../model_configs/sglang_model_config.yaml | 13 ++++++ src/lighteval/main_sglang.py | 22 +++------- src/lighteval/models/model_input.py | 1 - src/lighteval/models/sglang/sglang_model.py | 3 +- 5 files changed, 45 insertions(+), 37 deletions(-) create mode 100644 examples/model_configs/sglang_model_config.yaml diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/use-sglang-as-backend.mdx index 39d486f1e..595e4cfb9 100644 --- a/docs/source/use-sglang-as-backend.mdx +++ b/docs/source/use-sglang-as-backend.mdx @@ -29,23 +29,32 @@ lighteval sglang \ "leaderboard|truthfulqa:mc|0|0" ``` -Available arguments for `sglang` can be found in the `SGLangModelConfig`: - -- **pretrained** (str): HuggingFace Hub model ID name or the path to a pre-trained model to load. -- **load_format** (str): The format the weights are loaded in. Defaults to *.safetensors/*.bin. -- **dtype** (str): Dtype used for the model, defaults to bfloat16. -- **tp_size** (int): The number of GPUs the model weights get sharded over. -- **dp_size** (int): The number of data-parallel copies of the model. -- **context_length** (int | None): The number of tokens our model can process including the input. -- **random_seed** (int): Can be used to enforce more deterministic behavior. -- **trust_remote_code** (bool): If True, will use locally cached config files, otherwise use remote configs in HuggingFace. -- **skip_tokenizer_init** (bool): Set to true to provide the tokens to the engine and get the output tokens directly, typically used in RLHF. -- **kv_cache_dtype** (str): Dtype of the kv cache, defaults to the auto. 
-- **add_special_tokens** (bool): Whether to add special tokens to the input sequences. -- **sampling_backend** (str | None): The backend for sampling. -- **attention_backend** (str | None): The backend for attention computation and KV cache management. -- **mem_fraction_static** (float): Fraction of the free GPU memory used for static memory like model weights and KV cache. -- **chunked_prefill_size** (int): Perform the prefill in chunks of these size. +## Use a config file + +For more advanced configurations, you can use a config file for the model. +An example of a config file is shown below and can be found at `examples/model_configs/sglang_model_config.yaml`. + +```bash +lighteval sglang \ + "examples/model_configs/sglang_model_config.yaml" \ + "leaderboard|truthfulqa:mc|0|0" +``` + +```yaml +model: # Model specific parameters + base_params: + model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,dtype=float16,chunked_prefill_size=4096,mem_fraction_static=0.9" # Model args that you would pass in the command line + generation: # Generation specific parameters + temperature: 0.3 + repetition_penalty: 1.0 + frequency_penalty: 0.0 + presence_penalty: 0.0 + top_k: -1 + min_p: 0.0 + top_p: 0.9 + max_new_tokens: 256 + stop_tokens: ["", ""] +``` > [!WARNING] > In the case of OOM issues, you might need to reduce the context size of the diff --git a/examples/model_configs/sglang_model_config.yaml b/examples/model_configs/sglang_model_config.yaml new file mode 100644 index 000000000..2a980e3a8 --- /dev/null +++ b/examples/model_configs/sglang_model_config.yaml @@ -0,0 +1,13 @@ +model: + base_params: + model_args: "pretrained=HuggingFaceTB/SmolLM-1.7B,dtype=float16,chunked_prefill_size=4096,mem_fraction_static=0.9" + generation: + temperature: 0.3 + repetition_penalty: 1.0 + frequency_penalty: 0.0 + presence_penalty: 0.0 + top_k: -1 + min_p: 0.0 + top_p: 0.9 + max_new_tokens: 256 + stop_tokens: ["", ""] diff --git a/src/lighteval/main_sglang.py b/src/lighteval/main_sglang.py index a49840abe..b20cde512 100644 --- a/src/lighteval/main_sglang.py +++ b/src/lighteval/main_sglang.py @@ -19,9 +19,7 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-import json import os -import re from typing import Optional from typer import Argument, Option @@ -135,23 +133,13 @@ def sglang( if model_args.endswith(".yaml"): with open(model_args, "r") as f: config = yaml.safe_load(f)["model"] + model_args = config["base_params"]["model_args"] generation_parameters = GenerationParameters.from_dict(config) - model_config = SGLangModelConfig(config, generation_parameters=generation_parameters) - else: - pattern = re.compile(r"(\w+)=(\{.*\}|[^,]+)") - matches = pattern.findall(model_args) - model_args_dict = {} - generation_params = None - for key, value in matches: - key = key.strip() - if key == "generation_parameters": - value = re.sub(r"(\w+):", r'"\1":', value) - value = json.loads(value) - generation_params = GenerationParameters(**value) - else: - model_args_dict[key] = value - model_config = SGLangModelConfig(**model_args_dict, generation_parameters=generation_params) + generation_parameters = GenerationParameters() + + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_config = SGLangModelConfig(**model_args_dict, generation_parameters=generation_parameters) pipeline = Pipeline( tasks=tasks, diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index 56b9f379f..ebbf46f22 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -158,7 +158,6 @@ def to_tgi_ie_dict(self) -> dict: def to_sglang_dict(self) -> dict: args = { "max_new_tokens": self.max_new_tokens, - "stop_token_ids": self.stop_tokens, "temperature": self.temperature, "stop": self.stop_tokens, "top_p": self.top_p, diff --git a/src/lighteval/models/sglang/sglang_model.py b/src/lighteval/models/sglang/sglang_model.py index 9bb8af7bd..08346f325 100644 --- a/src/lighteval/models/sglang/sglang_model.py +++ b/src/lighteval/models/sglang/sglang_model.py @@ -136,7 +136,7 @@ def _create_auto_model(self, config: SGLangModelConfig, env_config: EnvConfig) - "device": "cuda", "random_seed": config.random_seed, "load_format": config.load_format, - "context_length": int(self._max_length) if self._max_length else 8192, + "context_length": self._max_length, "dp_size": int(config.dp_size), "tp_size": int(config.tp_size), "sampling_backend": config.sampling_backend, @@ -146,7 +146,6 @@ def _create_auto_model(self, config: SGLangModelConfig, env_config: EnvConfig) - "chunked_prefill_size": int(config.chunked_prefill_size), "disable_radix_cache": True, } - model = Engine(**self.model_args) if self._max_length is None: