diff --git a/csharp/tools/MauiModelTester/create_test_data.py b/csharp/tools/MauiModelTester/create_test_data.py index 6c57c71f94216..d73fd950a7bc0 100644 --- a/csharp/tools/MauiModelTester/create_test_data.py +++ b/csharp/tools/MauiModelTester/create_test_data.py @@ -2,7 +2,6 @@ import shutil import sys from pathlib import Path -from typing import Dict, List, Optional import numpy as np @@ -84,7 +83,7 @@ def parse_args(): return args -def create_existing_data_map(pb_files: List[Path]): +def create_existing_data_map(pb_files: list[Path]): import onnx_test_data_utils as data_utils data_map = {} @@ -98,9 +97,9 @@ def create_existing_data_map(pb_files: List[Path]): def add_model_and_test_data_to_app( model_path: Path, - symbolic_dims: Optional[Dict[str, int]] = None, - input_map: Optional[Dict[str, np.ndarray]] = None, - output_map: Optional[Dict[str, np.ndarray]] = None, + symbolic_dims: dict[str, int] | None = None, + input_map: dict[str, np.ndarray] | None = None, + output_map: dict[str, np.ndarray] | None = None, ): import ort_test_dir_utils as utils diff --git a/onnxruntime/python/backend/backend_rep.py b/onnxruntime/python/backend/backend_rep.py index af785b71c5f55..a30569d004d34 100644 --- a/onnxruntime/python/backend/backend_rep.py +++ b/onnxruntime/python/backend/backend_rep.py @@ -6,8 +6,6 @@ Implements ONNX's backend API. """ -from typing import Any, Tuple # noqa: F401 - from onnx.backend.base import BackendRep from onnxruntime import RunOptions diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index c12efc7fdfc9b..a3741abc48077 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -9,7 +9,8 @@ import os import typing import warnings -from typing import Any, Sequence +from collections.abc import Sequence +from typing import Any from onnxruntime.capi import _pybind_state as C @@ -143,7 +144,7 @@ def set_provider_options(name, options): if not all([isinstance(options_for_provider, dict) for options_for_provider in provider_options]): raise ValueError("'provider_options' values must be dicts.") - for name, options in zip(providers, provider_options): + for name, options in zip(providers, provider_options, strict=False): set_provider_options(name, options) else: diff --git a/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py b/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py index e0967ef5545db..76238b982fd96 100644 --- a/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py +++ b/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py @@ -22,7 +22,6 @@ import os import sys from dataclasses import dataclass -from typing import List, Optional, Union import onnx from onnx import TensorProto, helper @@ -65,7 +64,7 @@ class IOInfo: index: int name: str elem_type: TensorProto.DataType - shape: Optional[List[Union[int, str]]] + shape: list[int | str] | None def str_is_int(string: str) -> bool: @@ -76,7 +75,7 @@ def str_is_int(string: str) -> bool: return False -def parse_shape(shape_str: str) -> Optional[List[Union[int, str]]]: +def parse_shape(shape_str: str) -> list[int | str] | None: try: shape = [int(s) if str_is_int(s) else s for s in shape_str.split(",")] except ValueError: @@ -204,7 +203,7 @@ def parse_arguments() -> argparse.Namespace: return parser.parse_args() -def get_attributes(attr_data_info: List[List[str]]): +def get_attributes(attr_data_info: 
list[list[str]]): if not attr_data_info: return {} diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py b/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py index 66e1a8052ce84..363eb3865e699 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py @@ -13,11 +13,11 @@ import sys from abc import abstractmethod from argparse import Action, ArgumentParser +from collections.abc import Callable from contextlib import contextmanager from dataclasses import dataclass from fnmatch import fnmatch from functools import wraps -from typing import Callable build_dir = os.environ.get("KERNEL_EXPLORER_BUILD_DIR", None) if build_dir is None: diff --git a/onnxruntime/python/tools/offline_tuning.py b/onnxruntime/python/tools/offline_tuning.py index c032685b70f7c..c55b515814a28 100644 --- a/onnxruntime/python/tools/offline_tuning.py +++ b/onnxruntime/python/tools/offline_tuning.py @@ -7,11 +7,11 @@ import sys from collections import OrderedDict from pprint import pprint -from typing import Any, Dict, List +from typing import Any import onnx -TuningResults = Dict[str, Any] +TuningResults = dict[str, Any] _TUNING_RESULTS_KEY = "tuning_results" @@ -32,7 +32,7 @@ def extract(model: onnx.ModelProto): return json.loads(tuning_results_prop.value) -def embed(model: onnx.ModelProto, tuning_results: List[TuningResults], overwrite=False): +def embed(model: onnx.ModelProto, tuning_results: list[TuningResults], overwrite=False): idx = _find_tuning_results_in_props(model.metadata_props) assert overwrite or idx <= 0, "the supplied onnx file already have tuning results embedded!" @@ -47,7 +47,7 @@ def embed(model: onnx.ModelProto, tuning_results: List[TuningResults], overwrite class Merger: class EpAndValidators: - def __init__(self, ep: str, validators: Dict[str, str]): + def __init__(self, ep: str, validators: dict[str, str]): self.ep = ep self.validators = copy.deepcopy(validators) self.key = (ep, tuple(sorted(validators.items()))) @@ -61,7 +61,7 @@ def __eq__(self, other): def __init__(self): self.ev_to_results = OrderedDict() - def merge(self, tuning_results: List[TuningResults]): + def merge(self, tuning_results: list[TuningResults]): for trs in tuning_results: self._merge_one(trs) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index 0cd186bffdea0..ac11607e02710 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -4,7 +4,7 @@ # license information. 
# -------------------------------------------------------------------------- import logging -from typing import Any, Dict +from typing import Any import numpy as np import onnx @@ -36,7 +36,7 @@ class QuantizationParams: - def __init__(self, **data: Dict[str, Any]): + def __init__(self, **data: dict[str, Any]): self.data = {} for k, v in data.items(): if not isinstance(k, str): diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 7855f260a551a..f3bb533ac89e8 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -9,9 +9,9 @@ import itertools import os import uuid +from collections.abc import Sequence from enum import Enum from pathlib import Path -from typing import Dict, Optional, Sequence, Tuple, Union import numpy as np import onnx @@ -39,7 +39,7 @@ def rel_entr(pk: np.ndarray, qk: np.ndarray) -> np.ndarray: def entropy( pk: np.ndarray, qk: np.ndarray, - base: Optional[float] = None, + base: float | None = None, axis: int = 0, ) -> np.ndarray: """ @@ -100,7 +100,7 @@ def to_dict(self): class TensorsData: - def __init__(self, calibration_method, data: Dict[str, Union[TensorData, Tuple]]): + def __init__(self, calibration_method, data: dict[str, TensorData | tuple]): self.calibration_method = calibration_method self.data = {} for k, v in data.items(): @@ -187,8 +187,8 @@ def set_range(self, start_index: int, end_index: int): class CalibraterBase: def __init__( self, - model_path: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model_path: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", symmetric=False, use_external_data_format=False, @@ -297,8 +297,8 @@ def compute_data(self) -> TensorsData: class MinMaxCalibrater(CalibraterBase): def __init__( self, - model_path: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model_path: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", symmetric=False, use_external_data_format=False, @@ -476,7 +476,8 @@ def compute_data(self) -> TensorsData: output_names = [self.infer_session.get_outputs()[i].name for i in range(len(self.intermediate_outputs[0]))] output_dicts_list = [ - dict(zip(output_names, intermediate_output)) for intermediate_output in self.intermediate_outputs + dict(zip(output_names, intermediate_output, strict=False)) + for intermediate_output in self.intermediate_outputs ] merged_output_dict = {} @@ -507,7 +508,9 @@ def compute_data(self) -> TensorsData: else: pairs.append(tuple([min_value_array, max_value_array])) - new_calibrate_tensors_range = TensorsData(CalibrationMethod.MinMax, dict(zip(calibrate_tensor_names, pairs))) + new_calibrate_tensors_range = TensorsData( + CalibrationMethod.MinMax, dict(zip(calibrate_tensor_names, pairs, strict=False)) + ) if self.calibrate_tensors_range: self.calibrate_tensors_range = self.merge_range(self.calibrate_tensors_range, new_calibrate_tensors_range) else: @@ -519,8 +522,8 @@ def compute_data(self) -> TensorsData: class HistogramCalibrater(CalibraterBase): def __init__( self, - model_path: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model_path: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", use_external_data_format=False, method="percentile", @@ -608,7 +611,8 @@ def collect_data(self, 
data_reader: CalibrationDataReader): raise ValueError("No data is collected.") output_dicts_list = [ - dict(zip(output_names, intermediate_output)) for intermediate_output in self.intermediate_outputs + dict(zip(output_names, intermediate_output, strict=False)) + for intermediate_output in self.intermediate_outputs ] merged_dict = {} @@ -653,8 +657,8 @@ def compute_data(self) -> TensorsData: class EntropyCalibrater(HistogramCalibrater): def __init__( self, - model_path: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model_path: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", use_external_data_format=False, method="entropy", @@ -687,8 +691,8 @@ def __init__( class PercentileCalibrater(HistogramCalibrater): def __init__( self, - model_path: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model_path: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", use_external_data_format=False, method="percentile", @@ -721,8 +725,8 @@ def __init__( class DistributionCalibrater(HistogramCalibrater): def __init__( self, - model_path: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model_path: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", use_external_data_format=False, method="distribution", @@ -1168,8 +1172,8 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): def create_calibrator( - model: Union[str, Path], - op_types_to_calibrate: Optional[Sequence[str]] = None, + model: str | Path, + op_types_to_calibrate: Sequence[str] | None = None, augmented_model_path="augmented_model.onnx", calibrate_method=CalibrationMethod.MinMax, use_external_data_format=False, diff --git a/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py b/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py index 2bf47fe1680e9..2e8ee11e2f864 100644 --- a/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_bnb4_quantizer.py @@ -7,7 +7,6 @@ import argparse import logging import os -from typing import List, Tuple import numpy as np import numpy.typing as npt @@ -44,7 +43,7 @@ def __init__(self, model: ModelProto, quant_type: int, block_size: int, nodes_to self.nodes_to_exclude = set(nodes_to_exclude) @staticmethod - def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]: + def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: for gid in range(len(graph_path) - 1, -1, -1): graph = graph_path[gid] for tensor in graph.initializer: @@ -74,7 +73,7 @@ def bnb4_block_quant(self, fpweight: npt.ArrayLike) -> np.ndarray: return (packed, absmax) - def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto: + def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" if node.op_type != "MatMul": @@ -129,7 +128,7 @@ def _bnb4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto return matmul_bnb4_node - def _process_subgraph(self, graph_stack: List[GraphProto]): + def _process_subgraph(self, graph_stack: list[GraphProto]): new_nodes = [] graph = graph_stack[-1] diff --git 
a/onnxruntime/python/tools/quantization/qdq_loss_debug.py b/onnxruntime/python/tools/quantization/qdq_loss_debug.py index f9ed844febe46..9b545f2e94a2f 100644 --- a/onnxruntime/python/tools/quantization/qdq_loss_debug.py +++ b/onnxruntime/python/tools/quantization/qdq_loss_debug.py @@ -37,8 +37,8 @@ def get_next(self): import logging import math import time +from collections.abc import Callable, Sequence from pathlib import Path -from typing import Callable, Dict, List, Optional, Sequence, Union import numpy import onnx @@ -62,9 +62,9 @@ def get_next(self): def modify_model_output_intermediate_tensors( - input_model_path: Union[str, Path], - output_model_path: Union[str, Path], - op_types_for_saving: Optional[Sequence[str]] = None, + input_model_path: str | Path, + output_model_path: str | Path, + op_types_for_saving: Sequence[str] | None = None, save_as_external_data: bool = False, ) -> None: """Augment a given ONNX model to save node input/output tensors. @@ -116,8 +116,8 @@ def collect_activations( augmented_model: str, input_reader: CalibrationDataReader, session_options=None, - execution_providers: Optional[Sequence[str]] = None, -) -> Dict[str, List[numpy.ndarray]]: + execution_providers: Sequence[str] | None = None, +) -> dict[str, list[numpy.ndarray]]: """Run augmented model and collect activations tensors. Args: @@ -154,7 +154,7 @@ def collect_activations( output_dict = {} output_info = inference_session.get_outputs() for batch in intermediate_outputs: - for output, output_data in zip(output_info, batch): + for output, output_data in zip(output_info, batch, strict=False): if output.name.endswith(_TENSOR_SAVE_POSTFIX): output_name = output.name[:-_TENSOR_SAVE_POSTFIX_LEN] output_dict.setdefault(output_name, []).append(output_data) @@ -166,10 +166,10 @@ def collect_activations( def _add_pre_post_qdq_pair( - qdq_cmp: Dict[str, Dict[str, Sequence[numpy.ndarray]]], + qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]], activation_name: str, - pre_qdq_tensors: Optional[Sequence[numpy.ndarray]], - post_qdq_tensors: Optional[Sequence[numpy.ndarray]], + pre_qdq_tensors: Sequence[numpy.ndarray] | None, + post_qdq_tensors: Sequence[numpy.ndarray] | None, ) -> None: if post_qdq_tensors is not None and pre_qdq_tensors is not None: qdq_cmp[activation_name] = {} @@ -178,9 +178,9 @@ def _add_pre_post_qdq_pair( def create_activation_matching( - qdq_activations: Dict[str, Sequence[numpy.ndarray]], - float_activations: Optional[Dict[str, Sequence[numpy.ndarray]]] = None, -) -> Dict[str, Dict[str, Sequence[numpy.ndarray]]]: + qdq_activations: dict[str, Sequence[numpy.ndarray]], + float_activations: dict[str, Sequence[numpy.ndarray]] | None = None, +) -> dict[str, dict[str, Sequence[numpy.ndarray]]]: """Comparing activation values to help debugging accuracy loss due to quantization. 
This functions takes saved activations from the QDQ model and (optionally) the @@ -210,7 +210,7 @@ def create_activation_matching( ``` """ - qdq_cmp: Dict[str, Dict[str, Sequence[numpy.ndarray]]] = {} + qdq_cmp: dict[str, dict[str, Sequence[numpy.ndarray]]] = {} for tensor_name, tensors in qdq_activations.items(): if tensor_name.endswith(QUANT_INPUT_SUFFIX): pre_name = tensor_name[: -len(QUANT_INPUT_SUFFIX)] @@ -241,7 +241,7 @@ def create_activation_matching( def _run_dequantize_linear( weight_tensor: numpy.ndarray, weight_scale: numpy.ndarray, weight_zp: numpy.ndarray, channel_axis: int -) -> Optional[numpy.ndarray]: +) -> numpy.ndarray | None: assert weight_scale.shape == weight_zp.shape if weight_zp.size == 1: return (weight_tensor - weight_zp) * weight_scale @@ -267,7 +267,7 @@ def _run_dequantize_linear( return dequantized_weights -def create_weight_matching(float_model_path: str, qdq_model_path: str) -> Dict[str, Dict[str, numpy.ndarray]]: +def create_weight_matching(float_model_path: str, qdq_model_path: str) -> dict[str, dict[str, numpy.ndarray]]: """Comparing weight values to help debugging accuracy loss due to quantization. This functions takes the float model and the qdq model, and provides a data structure for comparing @@ -288,7 +288,7 @@ def create_weight_matching(float_model_path: str, qdq_model_path: str) -> Dict[s float_onnx_model = ONNXModel(load_model_with_shape_infer(Path(float_model_path))) qdq_onnx_model = ONNXModel(load_model_with_shape_infer(Path(qdq_model_path))) - matched_weights: Dict[str, Dict[str, numpy.ndarray]] = {} + matched_weights: dict[str, dict[str, numpy.ndarray]] = {} initializers = qdq_onnx_model.initializer() for node in qdq_onnx_model.nodes(): if node.op_type != DEQUANT_OP_NAME: @@ -339,7 +339,7 @@ def create_weight_matching(float_model_path: str, qdq_model_path: str) -> Dict[s def compute_signal_to_quantization_noice_ratio( - x: Union[Sequence[numpy.ndarray], numpy.ndarray], y: Union[Sequence[numpy.ndarray], numpy.ndarray] + x: Sequence[numpy.ndarray] | numpy.ndarray, y: Sequence[numpy.ndarray] | numpy.ndarray ) -> float: if isinstance(x, numpy.ndarray): xlist = [x] @@ -363,24 +363,24 @@ def compute_signal_to_quantization_noice_ratio( def compute_weight_error( - weights_match: Dict[str, Dict[str, numpy.ndarray]], + weights_match: dict[str, dict[str, numpy.ndarray]], err_func: Callable[[numpy.ndarray, numpy.ndarray], float] = compute_signal_to_quantization_noice_ratio, -) -> Dict[str, float]: - result: Dict[str, float] = {} +) -> dict[str, float]: + result: dict[str, float] = {} for weight_name, weight_match in weights_match.items(): result[weight_name] = err_func(weight_match["float"], weight_match["dequantized"]) return result def compute_activation_error( - activations_match: Dict[str, Dict[str, Sequence[numpy.ndarray]]], + activations_match: dict[str, dict[str, Sequence[numpy.ndarray]]], err_func: Callable[ [Sequence[numpy.ndarray], Sequence[numpy.ndarray]], float ] = compute_signal_to_quantization_noice_ratio, -) -> Dict[str, Dict[str, float]]: - result: Dict[str, Dict[str, float]] = {} +) -> dict[str, dict[str, float]]: + result: dict[str, dict[str, float]] = {} for name, match in activations_match.items(): - err_result: Dict[str, float] = {} + err_result: dict[str, float] = {} err_result["qdq_err"] = err_func(match["pre_qdq"], match["post_qdq"]) float_activation = match["float"] if float_activation: diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 
4ffd8b9872982..27221f9445c30 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -8,8 +8,9 @@ import copy import logging import tempfile +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable +from typing import Any import onnx diff --git a/onnxruntime/python/tools/quantization/shape_inference.py b/onnxruntime/python/tools/quantization/shape_inference.py index c07007f9d6129..63d34e1167de4 100644 --- a/onnxruntime/python/tools/quantization/shape_inference.py +++ b/onnxruntime/python/tools/quantization/shape_inference.py @@ -9,7 +9,6 @@ import tempfile import traceback from pathlib import Path -from typing import Optional, Union import onnx @@ -23,8 +22,8 @@ def quant_pre_process( - input_model: Optional[Union[str, Path, onnx.ModelProto]] = None, - output_model_path: Optional[Union[str, Path]] = None, + input_model: str | Path | onnx.ModelProto | None = None, + output_model_path: str | Path | None = None, skip_optimization: bool = False, skip_onnx_shape: bool = False, skip_symbolic_shape: bool = False, @@ -34,7 +33,7 @@ def quant_pre_process( verbose: int = 0, save_as_external_data: bool = False, all_tensors_to_one_file: bool = False, - external_data_location: Optional[str] = None, + external_data_location: str | None = None, external_data_size_threshold: int = 1024, **deprecated_kwargs, ) -> None: diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index b9ff2159028d0..b1bf9c9d537e6 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -651,7 +651,7 @@ def _compute_on_sympy_data(self, node, op_func): is_list = [isinstance(v, list) for v in values] as_list = any(is_list) if as_list: - self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values)] + self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values, strict=False)] else: self.sympy_data_[node.output[0]] = op_func(values) @@ -722,21 +722,21 @@ def _compute_conv_pool_shape(self, node, channels_last=False): dilations = get_attribute(node, "dilations", [1] * rank) strides = get_attribute(node, "strides", [1] * rank) - effective_kernel_shape = [(k - 1) * d + 1 for k, d in zip(kernel_shape, dilations)] + effective_kernel_shape = [(k - 1) * d + 1 for k, d in zip(kernel_shape, dilations, strict=False)] pads = get_attribute(node, "pads") if pads is None: pads = [0] * (2 * rank) auto_pad = get_attribute(node, "auto_pad", b"NOTSET").decode("utf-8") if auto_pad != "VALID" and auto_pad != "NOTSET": try: - residual = [sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides)] + residual = [sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides, strict=False)] total_pads = [ max(0, (k - s) if r == 0 else (k - r)) - for k, s, r in zip(effective_kernel_shape, strides, residual) + for k, s, r in zip(effective_kernel_shape, strides, residual, strict=False) ] except TypeError: # sympy may throw TypeError: cannot determine truth value of Relational total_pads = [ - max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides) + max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides, strict=False) ] # assuming no residual if sympy throws error elif auto_pad == "VALID": total_pads = [] @@ -744,7 +744,7 @@ def _compute_conv_pool_shape(self, node, channels_last=False): total_pads = [0] * rank else: assert len(pads) == 2 * rank - total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], 
pads[rank:])] + total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], pads[rank:], strict=False)] ceil_mode = get_attribute(node, "ceil_mode", 0) for i in range(rank): @@ -815,7 +815,7 @@ def _fuse_tensor_type(self, node, out_idx, dst_type, src_type): f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}" ) if dst_tensor_type.HasField("shape"): - for di, ds in enumerate(zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim)): + for di, ds in enumerate(zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim, strict=False)): if ds[0] != ds[1]: # create a new symbolic dimension for node/out_idx/mismatch dim id in dst_tensor_type for tensor_type # for sequence_type, clear the dimension @@ -1222,7 +1222,7 @@ def _infer_Loop(self, node): # noqa: N802 else: si = subgraph.input[i_out + 1] si_shape = get_shape_from_value_info(si) - for di, dims in enumerate(zip(si_shape, so_shape)): + for di, dims in enumerate(zip(si_shape, so_shape, strict=False)): if dims[0] != dims[1]: new_dim = onnx.TensorShapeProto.Dimension() new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, i_out, di)) @@ -1319,7 +1319,8 @@ def _infer_Pad(self, node): # noqa: N802 if pads is not None: assert len(pads) == 2 * rank new_sympy_shape = [ - d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:]) + d + pad_up + pad_down + for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:], strict=False) ] self._update_computed_dims(new_sympy_shape) else: @@ -1679,7 +1680,9 @@ def _infer_Resize(self, node): # noqa: N802 if get_opset(self.out_mp_) <= 10: scales = self._try_get_value(node, 1) if scales is not None: - new_sympy_shape = [sympy.simplify(sympy.floor(d * s)) for d, s in zip(input_sympy_shape, scales)] + new_sympy_shape = [ + sympy.simplify(sympy.floor(d * s)) for d, s in zip(input_sympy_shape, scales, strict=False) + ] self._update_computed_dims(new_sympy_shape) vi.CopyFrom( helper.make_tensor_value_info( @@ -1707,7 +1710,7 @@ def _infer_Resize(self, node): # noqa: N802 scales = list(scales) new_sympy_shape = [ sympy.simplify(sympy.floor(d * (end - start) * scale)) - for d, start, end, scale in zip(input_sympy_shape, roi_start, roi_end, scales) + for d, start, end, scale in zip(input_sympy_shape, roi_start, roi_end, scales, strict=False) ] self._update_computed_dims(new_sympy_shape) else: @@ -1893,7 +1896,7 @@ def handle_negative_index(index, bound): for i in axes: new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) else: - for i, s, e, t in zip(axes, starts, ends, steps): + for i, s, e, t in zip(axes, starts, ends, steps, strict=False): e = handle_negative_index(e, new_sympy_shape[i]) # noqa: PLW2901 if is_literal(e): if e >= self.int_max_: @@ -2841,7 +2844,7 @@ def get_prereq(node): self._add_suggested_merge( [ s[i] if is_literal(s[i]) else str(s[i]) - for s, i in zip(shapes, dim_idx) + for s, i in zip(shapes, dim_idx, strict=False) if i >= 0 ] ) diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 4fa5d0c0ea034..2152a66d1f2e7 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -607,7 +607,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): output = outputs[j] # Compare the results with reference outputs - for ref_o, o in zip(ref_output, output): + for ref_o, o in zip(ref_output, output, strict=False): # abs(desired-actual) < rtol * abs(desired) + atol try: 
np.testing.assert_allclose(ref_o, o, rtol, atol) diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 541dc4978dad1..0384300b99445 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -12,7 +12,6 @@ import shlex import subprocess import sys -from typing import List, Optional TRT_DOCKER_FILES = { "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", @@ -23,7 +22,7 @@ } -def run_cmd(cmd: List[str]) -> Optional[int]: +def run_cmd(cmd: list[str]) -> int | None: """ Runs a shell command and returns the process's return code. @@ -38,7 +37,7 @@ def run_cmd(cmd: List[str]) -> Optional[int]: return pty.spawn(cmd) -def get_common_docker_build_args(args: argparse.Namespace) -> List[str]: +def get_common_docker_build_args(args: argparse.Namespace) -> list[str]: """ Returns a list of common 'docker build' command-line arguments/options. diff --git a/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py index 492de13fb42b5..b308066edacad 100644 --- a/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py +++ b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py @@ -103,7 +103,7 @@ def parse_concurrency_test_log(input_path, output_path): # Parse mem_test log logs = ["valgrind.log", "concurrency_test.log"] csv_paths = ["mem_test.csv", "concurrency_test.csv"] - for log, csv_path in zip(logs, csv_paths): + for log, csv_path in zip(logs, csv_paths, strict=False): if os.path.exists(log): print(f"{identifier}: Parsing {log}") if log == logs[0]: @@ -112,7 +112,9 @@ def parse_concurrency_test_log(input_path, output_path): parse_concurrency_test_log(log, csv_path) # Upload to db - for csv_path, db_table_name in zip(csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"]): + for csv_path, db_table_name in zip( + csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"], strict=False + ): if os.path.exists(csv_path): table = pd.read_csv(csv_path) write_table( diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index d88e689521593..2a210729112d7 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -16,7 +16,7 @@ from datetime import datetime from enum import Enum from time import sleep -from typing import Any, Dict, List, Optional +from typing import Any import coloredlogs import numpy @@ -405,7 +405,7 @@ def set_random_seed(seed=123): # torch.backends.cudnn.deterministic = True -def get_gpu_info() -> Optional[List[Dict[str, Any]]]: +def get_gpu_info() -> list[dict[str, Any]] | None: from py3nvml.py3nvml import ( NVMLError, nvmlDeviceGetCount, @@ -459,7 +459,7 @@ def measure_cpu_usage(self): return max_usage @abstractmethod - def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]: + def measure_gpu_usage(self) -> list[dict[str, Any]] | None: raise NotImplementedError() @@ -467,7 +467,7 @@ class CudaMemoryMonitor(MemoryMonitor): def __init__(self, keep_measuring=True): super().__init__(keep_measuring) - def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]: + def measure_gpu_usage(self) -> list[dict[str, Any]] | None: from py3nvml.py3nvml import ( NVMLError, nvmlDeviceGetCount, diff --git 
a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 17c5d3602bb3b..c506bf4539173 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -23,7 +23,6 @@ from dataclasses import dataclass from datetime import datetime from pathlib import Path -from typing import Optional import numpy as np import psutil @@ -55,8 +54,8 @@ class ModelSetting: segment_ids_name: str input_mask_name: str opt_level: int - input_tuning_results: Optional[str] - output_tuning_results: Optional[str] + input_tuning_results: str | None + output_tuning_results: str | None mask_type: int diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index ccf2497d61342..55a4e4e5824ed 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -10,7 +10,6 @@ import os import random from pathlib import Path -from typing import Dict, Optional, Tuple import numpy as np from onnx import ModelProto, TensorProto, numpy_helper @@ -157,7 +156,7 @@ def fake_input_mask_data( return data -def output_test_data(directory: str, inputs: Dict[str, np.ndarray]): +def output_test_data(directory: str, inputs: dict[str, np.ndarray]): """Output input tensors of test data to a directory Args: @@ -305,10 +304,10 @@ def get_graph_input_from_embed_node(onnx_model, embed_node, input_index): def find_bert_inputs( onnx_model: OnnxModel, - input_ids_name: Optional[str] = None, - segment_ids_name: Optional[str] = None, - input_mask_name: Optional[str] = None, -) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: + input_ids_name: str | None = None, + segment_ids_name: str | None = None, + input_mask_name: str | None = None, +) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]: """Find graph inputs for BERT model. First, we will deduce inputs from EmbedLayerNormalization node. If not found, we will guess the meaning of graph inputs based on naming. @@ -397,10 +396,10 @@ def find_bert_inputs( def get_bert_inputs( onnx_file: str, - input_ids_name: Optional[str] = None, - segment_ids_name: Optional[str] = None, - input_mask_name: Optional[str] = None, -) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: + input_ids_name: str | None = None, + segment_ids_name: str | None = None, + input_mask_name: str | None = None, +) -> tuple[np.ndarray | None, np.ndarray | None, np.ndarray | None]: """Find graph inputs for BERT model. First, we will deduce inputs from EmbedLayerNormalization node. If not found, we will guess the meaning of graph inputs based on naming. 
@@ -531,9 +530,9 @@ def create_and_save_test_data( test_cases: int, seed: int, verbose: bool, - input_ids_name: Optional[str], - segment_ids_name: Optional[str], - input_mask_name: Optional[str], + input_ids_name: str | None, + segment_ids_name: str | None, + input_mask_name: str | None, only_input_tensors: bool, average_sequence_length: int, random_sequence_length: bool, diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 5a26fedb5287d..68bf9e9e69059 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -48,7 +48,7 @@ import time from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any import numpy as np import onnx @@ -86,7 +86,7 @@ def __str__(self): return self.value -def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace: +def parse_arguments(argv: list[str] | None = None) -> argparse.Namespace: """Parse arguments Args: @@ -883,8 +883,8 @@ def remove_shared_initializers( graph2: GraphProto, shared_prefix: str = "shared_", min_elements: int = 1024, - signature_cache1: Optional[dict] = None, - signature_cache2: Optional[dict] = None, + signature_cache1: dict | None = None, + signature_cache2: dict | None = None, ): """Remove initializers with same value from two graphs. @@ -1005,7 +1005,7 @@ def get_shared_initializers(encoder_model: ModelProto, decoder_model: ModelProto def move_initializers( graph: GraphProto, min_elements: int = 1024, -) -> List[TensorProto]: +) -> list[TensorProto]: """Remove initializers of a graph, when they have number of elements larger than a threshold. Args: @@ -2585,13 +2585,13 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati def test_torch_performance( args: argparse.Namespace, - model: Union[GPT2LMHeadModel, T5ForConditionalGeneration], + model: GPT2LMHeadModel | T5ForConditionalGeneration, input_ids: torch.Tensor, attention_mask: torch.Tensor, eos_token_id: int, pad_token_id: int, - bad_words_ids: List[List[int]], -) -> Dict[str, Any]: + bad_words_ids: list[list[int]], +) -> dict[str, Any]: """Test PyTorch performance of text generation. 
Args: @@ -2661,7 +2661,7 @@ def create_attention_mask(input_ids, pad_token_id): return attention_mask -def test_gpt_model(args: argparse.Namespace, sentences: Optional[List[str]] = None, is_greedy: bool = False): +def test_gpt_model(args: argparse.Namespace, sentences: list[str] | None = None, is_greedy: bool = False): """Test GPT-2 model Args: @@ -2872,7 +2872,7 @@ def test_gpt_model(args: argparse.Namespace, sentences: Optional[List[str]] = No return output -def test_t5_model(args: argparse.Namespace, sentences: Optional[List[str]] = None): +def test_t5_model(args: argparse.Namespace, sentences: list[str] | None = None): """Test T5 or MT5 model Args: @@ -3061,7 +3061,7 @@ def test_t5_model(args: argparse.Namespace, sentences: Optional[List[str]] = Non return output -def main(argv: Optional[List[str]] = None, sentences: Optional[List[str]] = None): +def main(argv: list[str] | None = None, sentences: list[str] | None = None): """Main entry function Args: diff --git a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py index e854312cae826..9a6388b3f350d 100644 --- a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py +++ b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py @@ -6,7 +6,6 @@ import argparse import logging import os -from typing import List, Union import coloredlogs from constants import ( @@ -26,15 +25,15 @@ class PackingAttentionBase: def __init__(self, model: OnnxModel, attention_op_type: str): self.model: OnnxModel = model - self.nodes_to_remove: List = [] - self.nodes_to_add: List = [] + self.nodes_to_remove: list = [] + self.nodes_to_add: list = [] self.prune_graph: bool = False self.node_name_to_graph_name: dict = {} self.this_graph_name: str = self.model.model.graph.name self.attention_op_type = attention_op_type self.attention_nodes = self.model.get_nodes_by_op_type(attention_op_type) - def _try_getting_attention_mask(self) -> Union[str, None]: + def _try_getting_attention_mask(self) -> str | None: mask_index = ( AttentionInputIDs.MASK_INDEX if self.attention_op_type == Operators.ATTENTION @@ -54,13 +53,13 @@ def _try_getting_attention_mask(self) -> Union[str, None]: return attention_mask - def _try_getting_first_attention(self) -> Union[NodeProto, None]: + def _try_getting_first_attention(self) -> NodeProto | None: if len(self.attention_nodes) <= 0: return None return self.attention_nodes[0] - def _try_getting_last_layernorm(self) -> Union[NodeProto, None]: + def _try_getting_last_layernorm(self) -> NodeProto | None: last_layernorm_node = None for node in self.model.nodes(): if node.op_type == Operators.LAYERNORM or node.op_type == Operators.SKIPLAYERNORM: @@ -70,7 +69,7 @@ def _try_getting_last_layernorm(self) -> Union[NodeProto, None]: def _are_attentions_supported(self) -> bool: raise NotImplementedError() - def _insert_removepadding_node(self, inputs: List[str], outputs: List[str]) -> None: + def _insert_removepadding_node(self, inputs: list[str], outputs: list[str]) -> None: new_node = helper.make_node( Operators.REMOVEPADDING, inputs=inputs, @@ -82,7 +81,7 @@ def _insert_removepadding_node(self, inputs: List[str], outputs: List[str]) -> N self.nodes_to_add.append(new_node) self.node_name_to_graph_name[new_node.name] = self.this_graph_name - def _insert_restorepadding_node(self, inputs: List[str], outputs: List[str]) -> None: + def _insert_restorepadding_node(self, inputs: list[str], outputs: list[str]) -> None: new_node = helper.make_node( 
Operators.RESTOREPADDING, inputs=inputs, @@ -97,7 +96,7 @@ def _insert_restorepadding_node(self, inputs: List[str], outputs: List[str]) -> def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None: raise NotImplementedError() - def _get_input_to_remove_padding(self, first_attention_node) -> Union[str, None]: + def _get_input_to_remove_padding(self, first_attention_node) -> str | None: if self.attention_op_type == Operators.ATTENTION: return first_attention_node.input[AttentionInputIDs.INPUT] return None @@ -306,7 +305,7 @@ def _replace_attention_with_packing_attention(self, token_offset: str, cumulativ logger.info("Converted %d MultiHeadAttention nodes to PackedMultiHeadAttention.", len(self.attention_nodes)) logger.info("Converted %d GatedRelativePositionBias nodes to packing mode.", gated_relative_pos_bias_count) - def _get_input_to_remove_padding(self, first_attention_node) -> Union[str, None]: + def _get_input_to_remove_padding(self, first_attention_node) -> str | None: # When there are query, key and value inputs, we need to find the first input of the parent MatMul node. matmul = self.model.get_parent(first_attention_node, 0) if matmul and matmul.op_type == "MatMul": diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index e9ac4a64f9fe5..349f5bb51fe47 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -16,7 +16,6 @@ import logging import os import tempfile -from typing import Dict import numpy as np import onnx @@ -304,7 +303,7 @@ def convert_float_to_float16( value_info_list.append(new_value_info) io_casts.add(node_name) - fp32_initializers: Dict[str, InitializerTracker] = {} + fp32_initializers: dict[str, InitializerTracker] = {} while queue: next_level = [] for q in queue: diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 56b5ae93e7221..c02cf5cbb4e54 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- from logging import getLogger -from typing import List, Optional, Tuple, Union import numpy as np from fusion_base import Fusion @@ -42,7 +41,7 @@ def get_first_mask(self): assert len(self.mask_indice) > 0 return next(iter(self.mask_indice)) - def process_mask(self, mask_2d: str) -> Optional[str]: + def process_mask(self, mask_2d: str) -> str | None: if self.mask_format == AttentionMaskFormat.NoMask: return None @@ -111,10 +110,10 @@ def __init__( model: OnnxModel, hidden_size: int, num_heads: int, - attention_mask: Optional[AttentionMask] = None, + attention_mask: AttentionMask | None = None, use_multi_head_attention: bool = False, disable_multi_head_attention_bias: bool = False, - search_op_types: List[str] = ["SkipLayerNormalization", "LayerNormalization"], # noqa: B006 + search_op_types: list[str] = ["SkipLayerNormalization", "LayerNormalization"], # noqa: B006 ): attention_op_name = "MultiHeadAttention" if use_multi_head_attention else "Attention" super().__init__(model, attention_op_name, search_op_types) @@ -132,7 +131,7 @@ def __init__( self.shape_infer = None self.shape_infer_done = True - def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> Tuple[int, int]: + def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> tuple[int, int]: """ Detect num_heads and hidden_size from Concat node in the following subgraph: @@ -163,7 +162,7 @@ def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> Tuple[ return self.num_heads, self.hidden_size - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> tuple[int, int]: """Detect num_heads and hidden_size from a reshape node. Args: @@ -358,10 +357,10 @@ def split_kv(self, present_k_name: str, present_v_name: str, kv_node: str): def create_combined_qkv_bias( self, q_add: NodeProto, - k_add: Union[NodeProto, None], - v_add: Union[NodeProto, None], + k_add: NodeProto | None, + v_add: NodeProto | None, name_prefix: str, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0]) qb = NumpyHelper.to_array(q_bias) kb = np.zeros_like(qb) @@ -391,9 +390,9 @@ def create_packed_qkv_matmul_node( k_matmul: NodeProto, v_matmul: NodeProto, q_add: NodeProto, - k_add: Union[NodeProto, None], - v_add: Union[NodeProto, None], - ) -> Tuple[NodeProto, NodeProto, NodeProto]: + k_add: NodeProto | None, + v_add: NodeProto | None, + ) -> tuple[NodeProto, NodeProto, NodeProto]: """Create packed QKV MatMul node before MultiHeadAttention node. This is for the scenario where an Attention node should be created but cannot be created because past_key and past_value are separate inputs and not one concatenated input. 
@@ -532,11 +531,11 @@ def create_packed_qkv_matmul_node( def create_multihead_attention_node( self, q_matmul: NodeProto, - k_matmul: Union[NodeProto, str, None], - v_matmul: Union[NodeProto, str, None], + k_matmul: NodeProto | str | None, + v_matmul: NodeProto | str | None, q_add: NodeProto, - k_add: Union[NodeProto, None], - v_add: Union[NodeProto, None], + k_add: NodeProto | None, + v_add: NodeProto | None, num_heads: int, hidden_size: int, output: str, @@ -547,7 +546,7 @@ def create_multihead_attention_node( present_k: str = "", present_v: str = "", packed_qkv: bool = False, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: """Create a MultiHeadAttention node. Args: @@ -647,7 +646,7 @@ def create_multihead_attention_node( def create_attention_node( self, - mask_index: Optional[str], + mask_index: str | None, q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto, @@ -663,9 +662,9 @@ def create_attention_node( past_v: str = "", present_k: str = "", present_v: str = "", - scale: Optional[float] = None, + scale: float | None = None, causal: bool = False, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: """Create an Attention node. Args: @@ -762,7 +761,7 @@ def create_attention_node( qkv_weight_dim = 3 * qw_out_size qkv_bias_dim = 0 - qkv_bias: Optional[np.ndarray] = None + qkv_bias: np.ndarray | None = None if has_bias: qb = NumpyHelper.to_array(q_bias) kb = NumpyHelper.to_array(k_bias) diff --git a/onnxruntime/python/tools/transformers/fusion_attention_clip.py b/onnxruntime/python/tools/transformers/fusion_attention_clip.py index 16e2c36bfd092..a4a7a5c8c1890 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_clip.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_clip.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Tuple from fusion_attention import AttentionMask, FusionAttention from fusion_options import AttentionMaskFormat @@ -36,7 +35,7 @@ def __init__( search_op_types=["SkipLayerNormalization"], ) - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> tuple[int, int]: """Detect num_heads and hidden_size for ONNX model from MiDaS Args: reshape_q (NodeProto): reshape node for q diff --git a/onnxruntime/python/tools/transformers/fusion_attention_sam2.py b/onnxruntime/python/tools/transformers/fusion_attention_sam2.py index ce7ddd3c1050e..f66d7d12d1e5f 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_sam2.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_sam2.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Tuple, Union import numpy as np from fusion_base import Fusion @@ -97,7 +96,7 @@ def get_hidden_size(self, layernorm_node): def get_num_heads_and_hidden_size( self, reshape_q: NodeProto, layernorm_node: NodeProto, is_encoder: bool = False - ) -> Tuple[int, int]: + ) -> tuple[int, int]: """Detect num_heads and hidden_size. Args: @@ -142,7 +141,7 @@ def create_attention_node( num_heads: int, hidden_size: int, output: str, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: """Create an Attention node. 
Args: diff --git a/onnxruntime/python/tools/transformers/fusion_attention_unet.py b/onnxruntime/python/tools/transformers/fusion_attention_unet.py index 9a353e7e2d675..1bdf4f24f3621 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_unet.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_unet.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Tuple, Union import numpy as np from fusion_base import Fusion @@ -91,7 +90,7 @@ def get_hidden_size(self, layernorm_node): def get_num_heads_and_hidden_size( self, reshape_q: NodeProto, layernorm_node: NodeProto, is_torch2: bool = False - ) -> Tuple[int, int]: + ) -> tuple[int, int]: """Detect num_heads and hidden_size. Args: @@ -132,7 +131,7 @@ def create_attention_node( hidden_size: int, input: str, output: str, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: """Create an Attention node. Args: @@ -390,7 +389,7 @@ def create_attention_node_lora( hidden_size: int, input: str, output: str, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: """Create an Attention node. Args: diff --git a/onnxruntime/python/tools/transformers/fusion_attention_vae.py b/onnxruntime/python/tools/transformers/fusion_attention_vae.py index 151c04f9334fe..2b57fa2c418cf 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_vae.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_vae.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Tuple, Union import numpy as np from fusion_base import Fusion @@ -27,7 +26,7 @@ def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): self.num_heads_warning = True self.hidden_size_warning = True - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto, add_q: NodeProto) -> Tuple[int, int]: + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto, add_q: NodeProto) -> tuple[int, int]: """Detect num_heads and hidden_size from a reshape node. Args: @@ -80,7 +79,7 @@ def create_attention_node( hidden_size: int, input_name: str, output_name: str, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: """Create an Attention node. Args: diff --git a/onnxruntime/python/tools/transformers/fusion_base.py b/onnxruntime/python/tools/transformers/fusion_base.py index 67f4f0b55cff8..a923e14c493f4 100644 --- a/onnxruntime/python/tools/transformers/fusion_base.py +++ b/onnxruntime/python/tools/transformers/fusion_base.py @@ -3,8 +3,9 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- from collections import defaultdict +from collections.abc import Sequence from logging import getLogger -from typing import Any, Dict, List, Optional, Sequence, Union +from typing import Any import numpy as np from onnx import NodeProto, helper @@ -22,18 +23,18 @@ def __init__( self, model: OnnxModel, fused_op_type: str, - search_op_types: Union[str, List[str]], + search_op_types: str | list[str], description: str = "", ): - self.search_op_types: List[str] = [search_op_types] if isinstance(search_op_types, str) else search_op_types + self.search_op_types: list[str] = [search_op_types] if isinstance(search_op_types, str) else search_op_types self.fused_op_type: str = fused_op_type self.description: str = f"{fused_op_type}({description})" if description else fused_op_type self.model: OnnxModel = model - self.nodes_to_remove: List = [] - self.nodes_to_add: List = [] + self.nodes_to_remove: list = [] + self.nodes_to_add: list = [] self.prune_graph: bool = False self.node_name_to_graph_name: dict = {} - self.this_graph_name: Optional[str] = None + self.this_graph_name: str | None = None # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter. self.fused_count: defaultdict = defaultdict(int) @@ -46,8 +47,8 @@ def increase_counter(self, fused_op_name: str): def fuse( self, node: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - output_name_to_node: Dict[str, NodeProto], + input_name_to_nodes: dict[str, list[NodeProto]], + output_name_to_node: dict[str, NodeProto], ): """Interface for fusion that starts from a node""" raise NotImplementedError @@ -114,7 +115,7 @@ def add_initializer(self, name: str, data_type: int, dims: Sequence[int], vals: self.model.add_initializer(tensor, self.this_graph_name) return tensor - def add_nodes_to_remove(self, nodes: List[NodeProto]): + def add_nodes_to_remove(self, nodes: list[NodeProto]): # Some nodes are shared between paths (e.g. rotary embedding nodes in the Q and K paths). # When path A is fused, its shared nodes are added to `self.nodes_to_remove`. But when path B # is fused, its shared nodes are also added to `self.nodes_to_remove`. When the nodes are @@ -131,7 +132,7 @@ def add_nodes_to_remove(self, nodes: List[NodeProto]): if node not in self.nodes_to_remove: self.nodes_to_remove.append(node) - def add_nodes_to_remove_with_nodes_to_keep(self, nodes: List[NodeProto], nodes_to_keep: List[NodeProto]): + def add_nodes_to_remove_with_nodes_to_keep(self, nodes: list[NodeProto], nodes_to_keep: list[NodeProto]): for node in nodes: if node not in self.nodes_to_remove and node not in nodes_to_keep: self.nodes_to_remove.append(node) diff --git a/onnxruntime/python/tools/transformers/fusion_bias_add.py b/onnxruntime/python/tools/transformers/fusion_bias_add.py index 8489af0940983..1cb4edad04ffe 100644 --- a/onnxruntime/python/tools/transformers/fusion_bias_add.py +++ b/onnxruntime/python/tools/transformers/fusion_bias_add.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict from fusion_base import Fusion from numpy import ndarray @@ -17,7 +16,7 @@ class FusionBiasAdd(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "BiasAdd", "Add") - def fuse(self, add_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, add_node, input_name_to_nodes: dict, output_name_to_node: dict): """ Fuse Add bias and Add skip connection into BiasAdd """ diff --git a/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py b/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py index 67a7c0fb9ceb3..1118809fdf6d3 100644 --- a/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py +++ b/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict from fusion_base import Fusion from onnx import helper @@ -16,7 +15,7 @@ class FusionBiasSplitGelu(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "BiasSplitGelu", "Gelu") - def fuse(self, gelu_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, gelu_node, input_name_to_nodes: dict, output_name_to_node: dict): """ [root] --->Add --------------------> Slice ---------------> Mul --> | ^ ^ diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index 70ff57f0626e1..66ef06097aa58 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict, List, Optional, Tuple, Union from fusion_base import Fusion from fusion_utils import FusionUtils @@ -35,7 +34,7 @@ def __init__(self, model: OnnxModel, description: str = "no mask"): self.attention = None self.embed_node = None - def match_two_gather(self, add: NodeProto) -> Union[None, Tuple[NodeProto, NodeProto]]: + def match_two_gather(self, add: NodeProto) -> None | tuple[NodeProto, NodeProto]: gather_0_path = self.model.match_parent_path(add, ["Gather"], [0]) if gather_0_path is None: return None @@ -49,7 +48,7 @@ def match_two_gather(self, add: NodeProto) -> Union[None, Tuple[NodeProto, NodeP def check_attention_subgraph( self, layernorm: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], + input_name_to_nodes: dict[str, list[NodeProto]], is_distil_bert: bool, ) -> bool: """Check that LayerNormalization has a child of Attention node or subgraph like Attention. @@ -399,7 +398,7 @@ def check_embedding(self, word_embedding_gather, segment_embedding_gather, posit return True - def cast_to_int32(self, input_name: str) -> Tuple[str, Union[None, NodeProto]]: + def cast_to_int32(self, input_name: str) -> tuple[str, None | NodeProto]: """Cast a graph input or node input to int32. 
Args: @@ -428,8 +427,8 @@ def create_fused_node( layernorm: NodeProto, word_embedding_gather: NodeProto, position_embedding_gather: NodeProto, - segment_embedding_gather: Union[None, NodeProto], - position_ids: Optional[str] = None, + segment_embedding_gather: None | NodeProto, + position_ids: str | None = None, embedding_sum_output=False, embedding_sum_name=None, ): diff --git a/onnxruntime/python/tools/transformers/fusion_fastgelu.py b/onnxruntime/python/tools/transformers/fusion_fastgelu.py index e2bb8027c8608..99f716193adb6 100644 --- a/onnxruntime/python/tools/transformers/fusion_fastgelu.py +++ b/onnxruntime/python/tools/transformers/fusion_fastgelu.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict, Optional from fusion_base import Fusion from onnx import helper @@ -16,7 +15,7 @@ class FusionFastGelu(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "FastGelu", "Tanh") - def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, tanh_node, input_name_to_nodes: dict, output_name_to_node: dict): if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node): return @@ -29,7 +28,7 @@ def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): if self.fuse_4(tanh_node, input_name_to_nodes, output_name_to_node): return - def fuse_1(self, tanh_node, input_name_to_nodes, output_name_to_node) -> Optional[bool]: + def fuse_1(self, tanh_node, input_name_to_nodes, output_name_to_node) -> bool | None: """ Fuse Gelu with tanh into one node: +---------------------------+ @@ -137,7 +136,7 @@ def fuse_1(self, tanh_node, input_name_to_nodes, output_name_to_node) -> Optiona self.node_name_to_graph_name[fused_node.name] = self.this_graph_name return True - def fuse_2(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + def fuse_2(self, tanh_node, input_name_to_nodes: dict, output_name_to_node: dict) -> bool | None: """ This pattern is from Tensorflow model. Fuse Gelu with tanh into one node: @@ -246,7 +245,7 @@ def fuse_2(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict self.node_name_to_graph_name[fused_node.name] = self.this_graph_name return True - def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + def fuse_3(self, tanh_node, input_name_to_nodes: dict, output_name_to_node: dict) -> bool | None: """ OpenAI's gelu implementation, also used in Megatron: Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x))) @@ -362,7 +361,7 @@ def fuse_3(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict self.node_name_to_graph_name[fused_node.name] = self.this_graph_name return True - def fuse_4(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + def fuse_4(self, tanh_node, input_name_to_nodes: dict, output_name_to_node: dict) -> bool | None: """ This pattern is from stable diffusion 3.5 model. Fuse Gelu with tanh into one node: diff --git a/onnxruntime/python/tools/transformers/fusion_gelu.py b/onnxruntime/python/tools/transformers/fusion_gelu.py index 6be5140c070d0..12f7d82a9c0af 100644 --- a/onnxruntime/python/tools/transformers/fusion_gelu.py +++ b/onnxruntime/python/tools/transformers/fusion_gelu.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
 # --------------------------------------------------------------------------
 from logging import getLogger
-from typing import Dict, Optional
 
 from fusion_base import Fusion
 from onnx import helper
@@ -16,14 +15,14 @@ class FusionGelu(Fusion):
     def __init__(self, model: OnnxModel):
         super().__init__(model, "Gelu", "Erf")
 
-    def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+    def fuse(self, erf_node, input_name_to_nodes: dict, output_name_to_node: dict):
         if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node):
             return
         if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node):
             return
         self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node)
 
-    def fuse_1(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+    def fuse_1(self, erf_node, input_name_to_nodes: dict, output_name_to_node: dict) -> bool | None:
         """
         This pattern is from PyTorch model
         Fuse Gelu with Erf into one node:
@@ -107,7 +106,7 @@ def fuse_1(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict)
         self.increase_counter("Gelu")
         return True
 
-    def fuse_2(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+    def fuse_2(self, erf_node, input_name_to_nodes: dict, output_name_to_node: dict) -> bool | None:
         """
         This pattern is from Keras model
         Fuse Gelu with Erf into one node:
@@ -184,7 +183,7 @@ def fuse_2(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict)
         self.increase_counter("Gelu")
         return True
 
-    def fuse_3(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]:
+    def fuse_3(self, erf_node, input_name_to_nodes: dict, output_name_to_node: dict) -> bool | None:
         """
         This pattern is from TensorFlow model
         Fuse Gelu with Erf into one node:
diff --git a/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py b/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py
index 4d9913f427b37..23eee1413ff9f 100644
--- a/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py
+++ b/onnxruntime/python/tools/transformers/fusion_gemmfastgelu.py
@@ -4,7 +4,6 @@
 # --------------------------------------------------------------------------
 
 from logging import getLogger
-from typing import Dict, List, Union
 
 from fusion_base import Fusion
 from fusion_utils import NumpyHelper
@@ -20,13 +19,13 @@ def __init__(self, model: OnnxModel):
         self.shape_infer = None
         self.shape_infer_done = False
 
-    def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[int, None]:
+    def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> int | None:
         if tensor_proto.type.tensor_type.HasField("shape"):
             return len(tensor_proto.type.tensor_type.shape.dim)
         else:
             return None
 
-    def get_dimensions(self, input_name: str) -> Union[int, None]:
+    def get_dimensions(self, input_name: str) -> int | None:
         graph_input = self.model.find_graph_input(input_name)
         if graph_input:
             return self.get_dimensions_from_tensor_proto(graph_input)
@@ -43,8 +42,8 @@ def get_dimensions(self, input_name: str) -> Union[int, None]:
     def fuse(
         self,
         node: NodeProto,
-        input_name_to_nodes: Dict[str, List[NodeProto]],
-        output_name_to_node: Dict[str, NodeProto],
+        input_name_to_nodes: dict[str, list[NodeProto]],
+        output_name_to_node: dict[str, NodeProto],
     ):
         """
         This pattern is from PyTorch bert model
diff --git a/onnxruntime/python/tools/transformers/fusion_group_norm.py b/onnxruntime/python/tools/transformers/fusion_group_norm.py
index c9bf52234d696..2efec3e6ac6e8 100644
--- a/onnxruntime/python/tools/transformers/fusion_group_norm.py
+++ b/onnxruntime/python/tools/transformers/fusion_group_norm.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 from logging import getLogger
-from typing import Dict
 
 import numpy as np
 from fusion_base import Fusion
@@ -18,7 +17,7 @@ def __init__(self, model: OnnxModel, channels_last=True):
         super().__init__(model, "GroupNorm", "Add")
         self.channels_last = channels_last
 
-    def fuse(self, add_node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+    def fuse(self, add_node, input_name_to_nodes: dict, output_name_to_node: dict):
         """
         Fuse Group Normalization subgraph into one node GroupNorm.
         The following is the pattern with swish activation:
diff --git a/onnxruntime/python/tools/transformers/fusion_layernorm.py b/onnxruntime/python/tools/transformers/fusion_layernorm.py
index 277bd0799cf16..1c96c54d9de35 100644
--- a/onnxruntime/python/tools/transformers/fusion_layernorm.py
+++ b/onnxruntime/python/tools/transformers/fusion_layernorm.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 from logging import getLogger
-from typing import Dict, List
 
 from fusion_base import Fusion
 from onnx import TensorProto, helper
@@ -18,7 +17,7 @@ def __init__(self, model: OnnxModel, check_constant_and_dimension: bool = True,
         self.check_constant_and_dimension = check_constant_and_dimension
         self.force = force
 
-    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+    def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict):
         """
         Fuse Layer Normalization subgraph into one node LayerNormalization:
           +----------------------+
@@ -184,7 +183,7 @@ def get_weight_or_bias(self, output_name, description):
 
         return value.reshape([value.shape[0]])
 
-    def create_transpose_node(self, input_name: str, perm: List[int], output_name=None):
+    def create_transpose_node(self, input_name: str, perm: list[int], output_name=None):
         """Append a Transpose node after an input"""
         node_name = self.model.create_node_name("Transpose")
@@ -196,7 +195,7 @@ def create_transpose_node(self, input_name: str, perm: List[int], output_name=No
 
         return transpose_node
 
-    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+    def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict):
         """
         Fuse Layer Normalization subgraph into one node LayerNormalization:
           +----------------------+
@@ -328,7 +327,7 @@ class FusionLayerNormalizationTF(Fusion):
     def __init__(self, model: OnnxModel):
         super().__init__(model, "LayerNormalization", "Add", "TF")
 
-    def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
+    def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict):
         """
          Layer Norm from Tensorflow model(using keras2onnx or tf2onnx):
           +------------------------------------+
diff --git a/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py b/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py
index dcad55c13eb49..48f6f9a9686ee 100644
--- a/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py
+++ b/onnxruntime/python/tools/transformers/fusion_mha_mmdit.py
@@ -3,7 +3,6 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 from logging import getLogger
-from typing import Dict, Optional
 
 import numpy as np
 from fusion_base import Fusion
@@ -131,7 +130,7 @@ def reshape_to_3d(self, input_name: str, output_name: str) -> str:
         self.node_name_to_graph_name[reshape_q.name] = self.this_graph_name
         return reshape_q.output[0]
 
-    def adjust_query_from_bnsh_to_bsd_no_concat(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]:
+    def adjust_query_from_bnsh_to_bsd_no_concat(self, mul_q: NodeProto, output_name_to_node) -> str | None:
         """
         MultiHeadAttenion requires query in BSD format. This function adjusts query from BNSH to BSD format.
@@ -179,7 +178,7 @@ def adjust_query_from_bnsh_to_bsd_no_concat(self, mul_q: NodeProto, output_name_
 
         return self.reshape_to_3d(sln_a.output[0], sln_output + "_BSD")
 
-    def adjust_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]:
+    def adjust_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> str | None:
         """
         MultiHeadAttenion requires query in BSD format. This function adjusts query from BNSH to BSD format.
@@ -294,7 +293,7 @@ def update_unsqueeze_axes_1_to_2(self, unsqueeze: NodeProto) -> str:
 
         return updated_unsqueeze_output
 
-    def update_unsqueeze_axes(self, add: NodeProto, output_name_to_node: Dict[str, NodeProto]) -> bool:
+    def update_unsqueeze_axes(self, add: NodeProto, output_name_to_node: dict[str, NodeProto]) -> bool:
         """
         Update axes of Unsqueeze from [1] to [2] in the following pattern:
               Unsqueeze          Unsqueeze
@@ -347,7 +346,7 @@ def update_unsqueeze_axes(self, add: NodeProto, output_name_to_node: Dict[str, N
         nodes_b[0].input[1] = self.update_unsqueeze_axes_1_to_2(nodes_b[1])
         return True
 
-    def adjust_flux_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]:
+    def adjust_flux_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> str | None:
         """
         Adjust graph to change query format from BNSH to BSD for Flux model.
         Note that the graph pattern is complex, and we only do a shallow match here.
@@ -431,7 +430,7 @@ def adjust_flux_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_no
 
         return self.reshape_to_3d(add.output[0], add.output[0] + "_BSD")
 
-    def adjust_flux_single_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> Optional[str]:
+    def adjust_flux_single_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_name_to_node) -> str | None:
         """
         Adjust graph to change query format from BNSH to BSD for Flux model.
         Note that the graph pattern is complex, and we only do a shallow match here.
@@ -482,7 +481,7 @@ def adjust_flux_single_query_from_bnsh_to_bsd(self, mul_q: NodeProto, output_nam return self.reshape_to_3d(add.output[0], add.output[0] + "_BSD") - def transpose_reshape_bnsh_to_bsd(self, q: str, output_name_to_node) -> Optional[str]: + def transpose_reshape_bnsh_to_bsd(self, q: str, output_name_to_node) -> str | None: transpose_q = helper.make_node( "Transpose", [q], diff --git a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py index 5233fdf272fbd..0ad50a270caf7 100644 --- a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py +++ b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import List from fusion_base import Fusion from fusion_utils import FusionUtils @@ -22,7 +21,7 @@ def __init__(self, model: OnnxModel, update_weight=False): self.update_weight = update_weight self.fusion_utils = FusionUtils(model) - def create_transpose_node(self, input_name: str, perm: List[int], output_name=None): + def create_transpose_node(self, input_name: str, perm: list[int], output_name=None): """Append a Transpose node after an input""" node_name = self.model.create_node_name("Transpose") diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_attention.py b/onnxruntime/python/tools/transformers/fusion_qordered_attention.py index fb020298bc210..52ccfc6fe368d 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_attention.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Tuple import numpy as np from fusion_attention import AttentionMask @@ -30,7 +29,7 @@ def __init__( super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization") - def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> tuple[int, int]: """Detect num_heads and hidden_size from a reshape node. 
Args: reshape_q (NodeProto): reshape node for Q diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py index 5f395b364eb6f..6a6b52a988c00 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict from fusion_base import Fusion from fusion_utils import FusionUtils @@ -18,7 +17,7 @@ class FusionQOrderedGelu(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"]) - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): """ INPUT PATTERN Fuse (quantized) Gelu subgraph into one node QOrderedGelu: diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py index 5ec6dadc1e677..c8b1be71d4616 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict from fusion_base import Fusion from fusion_utils import FusionUtils @@ -17,7 +16,7 @@ class FusionQOrderedLayerNormalization(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization") - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): """ Fuse (quantized) Layer Normalization subgraph into one node QOrderedLayerNormalization: quantized input -> DQ diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py b/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py index 681160479faef..3a373f3fd4d78 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict from fusion_base import Fusion from fusion_utils import FusionUtils @@ -18,7 +17,7 @@ class FusionQOrderedMatMul(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "QOrderedMatMul", "MatMul") - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): matmul_children = self.model.get_children(node, input_name_to_nodes) # Should only have 1 child - Bias Add diff --git a/onnxruntime/python/tools/transformers/fusion_rotary_attention.py b/onnxruntime/python/tools/transformers/fusion_rotary_attention.py index efdcbcfb3dcdc..6657fde2257e5 100644 --- a/onnxruntime/python/tools/transformers/fusion_rotary_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_rotary_attention.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import logging -from typing import Optional, Union from fusion_attention import FusionAttention from fusion_base import Fusion @@ -51,8 +50,8 @@ def create_mha_node( past_v: str = "", present_k: str = "", present_v: str = "", - scale: Optional[float] = None, - ) -> Union[NodeProto, None]: + scale: float | None = None, + ) -> NodeProto | None: assert self.num_heads > 0 if self.hidden_size > 0 and (self.hidden_size % self.num_heads) != 0: @@ -1131,7 +1130,7 @@ def reassign_extra_outputs(self, rot_emb_node: NodeProto, function: FunctionProt extra_initializers.append(constant_tensorproto.name) # Update references of Constant node outputs to initializer references - for extra_output, extra_initializer in zip(extra_outputs, extra_initializers): + for extra_output, extra_initializer in zip(extra_outputs, extra_initializers, strict=False): nodes_to_update = list(filter(lambda entry: extra_output in entry.input, self.model.model.graph.node)) for node_to_update in nodes_to_update: OnnxModel.replace_node_input(node_to_update, extra_output, extra_initializer) diff --git a/onnxruntime/python/tools/transformers/fusion_shape.py b/onnxruntime/python/tools/transformers/fusion_shape.py index dfa77fc7d0221..18a8fda6a67b1 100644 --- a/onnxruntime/python/tools/transformers/fusion_shape.py +++ b/onnxruntime/python/tools/transformers/fusion_shape.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict, List, Union from fusion_base import Fusion from fusion_utils import FusionUtils @@ -22,13 +21,13 @@ def __init__(self, model: OnnxModel): self.shape_infer = None self.shape_infer_done = False - def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[int, None]: + def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> int | None: if tensor_proto.type.tensor_type.HasField("shape"): return len(tensor_proto.type.tensor_type.shape.dim) else: return None - def get_dimensions(self, input_name: str) -> Union[int, None]: + def get_dimensions(self, input_name: str) -> int | None: shape = self.model.get_shape(input_name) if shape is not None: return len(shape) @@ -45,8 +44,8 @@ def get_dimensions(self, input_name: str) -> Union[int, None]: def fuse( self, concat_node: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - output_name_to_node: Dict[str, NodeProto], + input_name_to_nodes: dict[str, list[NodeProto]], + output_name_to_node: dict[str, NodeProto], ): # # Simplify subgraph like diff --git a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py index ca7ff6462b9ff..a0eff081675fe 100644 --- a/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_simplified_layernorm.py @@ -1,5 +1,4 @@ import logging -from typing import Dict from fusion_base import Fusion from fusion_skiplayernorm import FusionSkipLayerNormalization @@ -13,7 +12,7 @@ class FusionSimplifiedLayerNormalization(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "SimplifiedLayerNormalization", "Mul") - def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + def fuse(self, node, input_name_to_nodes: dict, output_name_to_node: dict): if node.op_type != "Mul": return diff --git a/onnxruntime/python/tools/transformers/fusion_skip_group_norm.py 
b/onnxruntime/python/tools/transformers/fusion_skip_group_norm.py index 676052f747967..b2b3af38253c2 100644 --- a/onnxruntime/python/tools/transformers/fusion_skip_group_norm.py +++ b/onnxruntime/python/tools/transformers/fusion_skip_group_norm.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- from logging import getLogger -from typing import List from fusion_base import Fusion from fusion_utils import NumpyHelper @@ -26,7 +25,7 @@ def __init__(self, model: OnnxModel): if self.shape_infer_helper is None: logger.warning("SkipGroupNorm fusion will be skipped since symbolic shape inference disabled or failed.") - def create_transpose_node(self, input_name: str, perm: List[int], output_name=None): + def create_transpose_node(self, input_name: str, perm: list[int], output_name=None): """Append a Transpose node after an input""" node_name = self.model.create_node_name("Transpose") if output_name is None: diff --git a/onnxruntime/python/tools/transformers/fusion_transpose.py b/onnxruntime/python/tools/transformers/fusion_transpose.py index ca699903a7cd9..d38fcffb2af0d 100644 --- a/onnxruntime/python/tools/transformers/fusion_transpose.py +++ b/onnxruntime/python/tools/transformers/fusion_transpose.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict, List from fusion_base import Fusion from fusion_utils import FusionUtils @@ -21,8 +20,8 @@ def __init__(self, model: OnnxModel): def fuse( self, transpose_node: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - output_name_to_node: Dict[str, NodeProto], + input_name_to_nodes: dict[str, list[NodeProto]], + output_name_to_node: dict[str, NodeProto], ): """ Note that onnxruntime will do comprehensive transpose optimization after loading model. @@ -90,7 +89,7 @@ class FusionInsertTranspose(Fusion): def __init__(self, model: OnnxModel): super().__init__(model, "", "GroupNorm") - def create_transpose_node(self, input_name: str, perm: List[int], output_name=None): + def create_transpose_node(self, input_name: str, perm: list[int], output_name=None): """Append a Transpose node after an input""" node_name = self.model.create_node_name("Transpose") if output_name is None: @@ -102,8 +101,8 @@ def create_transpose_node(self, input_name: str, perm: List[int], output_name=No def fuse( self, group_norm_node: NodeProto, - input_name_to_nodes: Dict[str, List[NodeProto]], - output_name_to_node: Dict[str, NodeProto], + input_name_to_nodes: dict[str, list[NodeProto]], + output_name_to_node: dict[str, NodeProto], ): """ This optimization will insert an Transpose, and onnxruntime transpose optimizer will remove it together with diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 3084b84278994..5343c77adb97a 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- from logging import getLogger -from typing import Optional, Tuple import numpy from numpy import array_equal, ndarray @@ -18,7 +17,7 @@ class FusionUtils: def __init__(self, model: OnnxModel): self.model: OnnxModel = model - def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: + def cast_graph_input_to_int32(self, input_name: str) -> tuple[bool, str]: graph_input = self.model.find_graph_input(input_name) if graph_input is not None and graph_input.type.tensor_type.elem_type != TensorProto.INT32: cast_output, cast_node = self.cast_input_to_int32(input_name) @@ -48,9 +47,9 @@ def add_cast_node( self, input_name: str, to_type: int, - output_name: Optional[str] = None, + output_name: str | None = None, output_name_to_node=None, - graph_name: Optional[str] = None, + graph_name: str | None = None, ): if output_name is None: output_name = input_name + f"_cast_to_{to_type}" @@ -127,7 +126,7 @@ def skip_parent(model: OnnxModel, node, parent_node, input_name_to_nodes, node_i return parent_can_be_removed - def get_squeeze_or_unsqueeze_axes(self, node: NodeProto) -> Optional[ndarray]: + def get_squeeze_or_unsqueeze_axes(self, node: NodeProto) -> ndarray | None: assert node.op_type in ["Squeeze", "Unsqueeze"] # For opset >= 13, axes is an input instead of an attribute. diff --git a/onnxruntime/python/tools/transformers/io_binding_helper.py b/onnxruntime/python/tools/transformers/io_binding_helper.py index 0fa038d5cfc62..5870a031086ee 100644 --- a/onnxruntime/python/tools/transformers/io_binding_helper.py +++ b/onnxruntime/python/tools/transformers/io_binding_helper.py @@ -1,7 +1,8 @@ import copy import logging from collections import OrderedDict -from typing import Any, Dict, List, Mapping, Optional, Tuple, Union +from collections.abc import Mapping +from typing import Any import numpy import torch @@ -9,7 +10,7 @@ from onnxruntime import InferenceSession, RunOptions # Type alias -ShapeDict = Mapping[str, Union[Tuple, List[int]]] +ShapeDict = Mapping[str, tuple | list[int]] logger = logging.getLogger(__name__) @@ -88,7 +89,7 @@ def torch_type_to_numpy_type(torch_type: torch.dtype): return torch_type_to_numpy_type_map[torch_type] @staticmethod - def get_io_numpy_type_map(ort_session: InferenceSession) -> Dict[str, numpy.dtype]: + def get_io_numpy_type_map(ort_session: InferenceSession) -> dict[str, numpy.dtype]: """Create a mapping from input/output name to numpy data type""" name_to_numpy_type = {} for input in ort_session.get_inputs(): @@ -116,7 +117,7 @@ def prepare_io_binding( input_ids: torch.Tensor, position_ids: torch.Tensor, attention_mask: torch.Tensor, - past: List[torch.Tensor], + past: list[torch.Tensor], output_buffers, output_shapes, name_to_np_type=None, @@ -228,7 +229,7 @@ def __init__(self, ort_session: InferenceSession, device: torch.device, enable_c self.device = device # Pairs of input and output names that share the same buffer. 
- self.buffer_sharing: Dict[str, str] = {} + self.buffer_sharing: dict[str, str] = {} def set_buffer_sharing(self, input_name: str, output_name: str): assert input_name in self.input_names @@ -307,7 +308,7 @@ def allocate_buffers(self, shape_dict: ShapeDict): tensor.data_ptr(), ) - def infer(self, feed_dict: Dict[str, torch.Tensor], run_options: RunOptions = None, synchronize: bool = True): + def infer(self, feed_dict: dict[str, torch.Tensor], run_options: RunOptions = None, synchronize: bool = True): """Bind input tensors and run inference""" for name, tensor in feed_dict.items(): assert isinstance(tensor, torch.Tensor) and tensor.is_contiguous() @@ -330,7 +331,7 @@ def infer(self, feed_dict: Dict[str, torch.Tensor], run_options: RunOptions = No return self.output_tensors @staticmethod - def get_cuda_provider_options(device_id: int, enable_cuda_graph: bool, stream: int = 0) -> Dict[str, Any]: + def get_cuda_provider_options(device_id: int, enable_cuda_graph: bool, stream: int = 0) -> dict[str, Any]: options = { "device_id": device_id, "arena_extend_strategy": "kSameAsRequested", @@ -353,7 +354,7 @@ def __init__( enable_gpu_graph: bool = False, gpu_graph_id: int = -1, stream: int = 0, - buffer_sharing: Optional[Dict[str, str]] = None, + buffer_sharing: dict[str, str] | None = None, ): super().__init__(ort_session, device, enable_gpu_graph) if buffer_sharing: @@ -379,7 +380,7 @@ def get_run_options(self, disable_cuda_graph_in_run: bool = False) -> RunOptions return options - def infer(self, feed_dict: Dict[str, torch.Tensor], disable_cuda_graph_in_run: bool = False): + def infer(self, feed_dict: dict[str, torch.Tensor], disable_cuda_graph_in_run: bool = False): run_options = self.get_run_options(disable_cuda_graph_in_run) if self.stream: @@ -411,7 +412,7 @@ def get_binding( self, shape_dict: ShapeDict, use_cuda_graph: bool = False, - buffer_sharing: Optional[Dict[str, str]] = None, + buffer_sharing: dict[str, str] | None = None, ) -> GpuBinding: for gpu_graph_binding in self.graph_bindings: # Found a cuda graph that captured with the same shape diff --git a/onnxruntime/python/tools/transformers/large_model_exporter.py b/onnxruntime/python/tools/transformers/large_model_exporter.py index f623102802a67..29829a6c475d9 100644 --- a/onnxruntime/python/tools/transformers/large_model_exporter.py +++ b/onnxruntime/python/tools/transformers/large_model_exporter.py @@ -13,7 +13,6 @@ import os import tempfile from pathlib import Path -from typing import Optional import onnx import torch @@ -50,7 +49,7 @@ def get_model_parameter_size(model: nn.Module): return all_size -def initialize_model_and_sample_inputs(hf_model: str, cache_dir: Optional[str], tokenizer=None): +def initialize_model_and_sample_inputs(hf_model: str, cache_dir: str | None, tokenizer=None): """ get the pretrained torch model from hugginface, and sample model-inputs @@ -155,7 +154,7 @@ def hook_for_inputs(_, inputs, kwargs): for key, value in user_inputs[1].items(): idx = input_keys.index(key) onnx_inputs[idx] = value - for idx, (key, value) in enumerate(zip(input_keys, onnx_inputs)): + for idx, (key, value) in enumerate(zip(input_keys, onnx_inputs, strict=False)): if type(value) is torch.Tensor: value.to(model.device) if "use_cache" in key: @@ -309,7 +308,7 @@ def do_export_internal(model: nn.Module, onnx_io_tuple: tuple, onnx_inputs: tupl @torch.no_grad() -def export_onnx(hf_model: str, cache_dir: Optional[str], onnx_path_str: str, with_past: bool, opset: int): +def export_onnx(hf_model: str, cache_dir: str | None, onnx_path_str: str, 
with_past: bool, opset: int): """ do export model: torch model diff --git a/onnxruntime/python/tools/transformers/machine_info.py b/onnxruntime/python/tools/transformers/machine_info.py index d4194abbd14d3..7f9a0110bcd9f 100644 --- a/onnxruntime/python/tools/transformers/machine_info.py +++ b/onnxruntime/python/tools/transformers/machine_info.py @@ -10,7 +10,6 @@ import logging import platform from os import environ -from typing import Dict, List import cpuinfo import psutil @@ -66,12 +65,12 @@ def get_machine_info(self): } return machine_info - def get_memory_info(self) -> Dict: + def get_memory_info(self) -> dict: """Get memory info""" mem = psutil.virtual_memory() return {"total": mem.total, "available": mem.available} - def _try_get(self, cpu_info: Dict, names: List) -> str: + def _try_get(self, cpu_info: dict, names: list) -> str: for name in names: if name in cpu_info: value = cpu_info[name] @@ -80,7 +79,7 @@ def _try_get(self, cpu_info: Dict, names: List) -> str: return value return "" - def get_cpu_info(self) -> Dict: + def get_cpu_info(self) -> dict: """Get CPU info""" cpu_info = cpuinfo.get_cpu_info() @@ -94,7 +93,7 @@ def get_cpu_info(self) -> Dict: "processor": platform.uname().processor, } - def get_gpu_info_by_nvml(self) -> Dict: + def get_gpu_info_by_nvml(self) -> dict: """Get GPU info using nvml""" gpu_info_list = [] driver_version = None @@ -122,7 +121,7 @@ def get_gpu_info_by_nvml(self) -> Dict: result["cuda_visible"] = environ["CUDA_VISIBLE_DEVICES"] return result - def get_related_packages(self) -> List[str]: + def get_related_packages(self) -> list[str]: import pkg_resources installed_packages = pkg_resources.working_set @@ -142,7 +141,7 @@ def get_related_packages(self) -> List[str]: related_packages_list = {i.key: i.version for i in installed_packages if i.key in related_packages} return related_packages_list - def get_onnxruntime_info(self) -> Dict: + def get_onnxruntime_info(self) -> dict: try: import onnxruntime @@ -159,7 +158,7 @@ def get_onnxruntime_info(self) -> Dict: self.logger.exception(exception, False) return None - def get_pytorch_info(self) -> Dict: + def get_pytorch_info(self) -> dict: try: import torch @@ -177,7 +176,7 @@ def get_pytorch_info(self) -> Dict: self.logger.exception(exception, False) return None - def get_tensorflow_info(self) -> Dict: + def get_tensorflow_info(self) -> dict: try: import tensorflow as tf diff --git a/onnxruntime/python/tools/transformers/metrics.py b/onnxruntime/python/tools/transformers/metrics.py index 282c75ba8f6a5..74a34df28c019 100644 --- a/onnxruntime/python/tools/transformers/metrics.py +++ b/onnxruntime/python/tools/transformers/metrics.py @@ -6,7 +6,6 @@ import datetime import json -from typing import Optional import pandas as pd @@ -30,10 +29,10 @@ def to_dict(self): class ModelInfo(BaseObject): def __init__( self, - full_name: Optional[str] = None, - is_huggingface: Optional[bool] = False, - is_text_generation: Optional[bool] = False, - short_name: Optional[str] = None, + full_name: str | None = None, + is_huggingface: bool | None = False, + is_text_generation: bool | None = False, + short_name: str | None = None, ): super().__init__() self.full_name = full_name @@ -46,9 +45,9 @@ def __init__( class BackendOptions(BaseObject): def __init__( self, - enable_profiling: Optional[bool] = False, - execution_provider: Optional[str] = None, - use_io_binding: Optional[bool] = False, + enable_profiling: bool | None = False, + execution_provider: str | None = None, + use_io_binding: bool | None = False, ): 
super().__init__() self.enable_profiling = enable_profiling @@ -59,12 +58,12 @@ def __init__( class Config(BaseObject): def __init__( self, - backend: Optional[str] = "onnxruntime", - batch_size: Optional[int] = 1, - seq_length: Optional[int] = 0, - precision: Optional[str] = "fp32", - warmup_runs: Optional[int] = 1, - measured_runs: Optional[int] = 10, + backend: str | None = "onnxruntime", + batch_size: int | None = 1, + seq_length: int | None = 0, + precision: str | None = "fp32", + warmup_runs: int | None = 1, + measured_runs: int | None = 10, ): super().__init__() self.backend = backend @@ -80,11 +79,11 @@ def __init__( class Metadata(BaseObject): def __init__( self, - device: Optional[str] = None, - package_name: Optional[str] = None, - package_version: Optional[str] = None, - platform: Optional[str] = None, - python_version: Optional[str] = None, + device: str | None = None, + package_name: str | None = None, + package_version: str | None = None, + platform: str | None = None, + python_version: str | None = None, ): super().__init__() self.device = device @@ -97,9 +96,9 @@ def __init__( class Metrics(BaseObject): def __init__( self, - latency_ms_mean: Optional[float] = 0.0, - throughput_qps: Optional[float] = 0.0, - max_memory_usage_GB: Optional[float] = 0.0, + latency_ms_mean: float | None = 0.0, + throughput_qps: float | None = 0.0, + max_memory_usage_GB: float | None = 0.0, ): super().__init__() self.latency_ms_mean = latency_ms_mean @@ -116,10 +115,10 @@ def __init__( device: str, package_name: str, package_version: str, - batch_size: Optional[int] = 1, - warmup_runs: Optional[int] = 1, - measured_runs: Optional[int] = 10, - trigger_date: Optional[str] = None, + batch_size: int | None = 1, + warmup_runs: int | None = 1, + measured_runs: int | None = 10, + trigger_date: str | None = None, ): self.config = Config() self.metrics = Metrics() diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_helper.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_helper.py index 8b7c18dbde7d9..85d2fa9a64e23 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_helper.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_helper.py @@ -4,13 +4,12 @@ # license information. # -------------------------------------------------------------------------- -from typing import List, Tuple import torch from transformers import BartConfig, BartForConditionalGeneration, BartTokenizer -def group_by_self_and_cross(present_key_values: Tuple[torch.Tensor], concat: bool = False): +def group_by_self_and_cross(present_key_values: tuple[torch.Tensor], concat: bool = False): """Categorize present_key_values into self and cross attention. Split present state from grouped by layer to grouped by self/cross attention. @@ -27,8 +26,8 @@ def group_by_self_and_cross(present_key_values: Tuple[torch.Tensor], concat: boo present_self (Tuple[torch.Tensor]): present key and values from self attention present_cross (Tuple[torch.Tensor]): present key and values from cross attention """ - present_self: List[torch.Tensor] = [] - present_cross: List[torch.Tensor] = [] + present_self: list[torch.Tensor] = [] + present_cross: list[torch.Tensor] = [] for _, present_layer_i in enumerate(present_key_values): assert len(present_layer_i) == 4, f"Expected to have four items. 
Got {len(present_layer_i)}" present_key_self, present_value_self, present_key_cross, present_value_cross = present_layer_i @@ -40,7 +39,7 @@ def group_by_self_and_cross(present_key_values: Tuple[torch.Tensor], concat: boo return present_self, present_cross -def back_group_by_layer(past_key_values: Tuple[Tuple[torch.Tensor]]): +def back_group_by_layer(past_key_values: tuple[tuple[torch.Tensor]]): """Categorize present_key_values from self and cross attention to layer by layer. Reorder past state from grouped by self/cross attention to grouped by layer. @@ -70,7 +69,7 @@ def back_group_by_layer(past_key_values: Tuple[Tuple[torch.Tensor]]): return past_tuples -def get_input_names(past_key_values: Tuple[Tuple[torch.Tensor]], encoder=True): +def get_input_names(past_key_values: tuple[tuple[torch.Tensor]], encoder=True): """Process input names of model wrapper. Args: @@ -89,7 +88,7 @@ def get_input_names(past_key_values: Tuple[Tuple[torch.Tensor]], encoder=True): return names -def get_output_names(past_key_values: Tuple[torch.Tensor]): +def get_output_names(past_key_values: tuple[torch.Tensor]): """Process output names of model wrapper. As cross attention is unchanged during every iteration of beam search, diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py index 8a610fb17671b..f8d13ca041349 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py @@ -6,7 +6,7 @@ import os import time -from typing import Any, Dict, Optional +from typing import Any import torch from transformers import BartConfig, BartForConditionalGeneration, file_utils @@ -87,8 +87,8 @@ def _create_encoder_export(args, config: BartConfig): """ def _prepare_encoder_decoder_kwargs_for_generation( - self, input_ids: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None - ) -> Dict[str, Any]: + self, input_ids: torch.Tensor, model_kwargs, model_input_name: str | None = None + ) -> dict[str, Any]: # retrieve encoder hidden states # 1. get encoder encoder = self.get_encoder() diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py index afd01ae9d025f..475e4c5aecd18 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py @@ -208,7 +208,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwarg # Test the generated model with onnxruntime print("========== ORT inference test on Decoder ... 
==========") - ort_inputs = {name: value.cpu().numpy() for name, value in zip(input_names, inputs)} + ort_inputs = {name: value.cpu().numpy() for name, value in zip(input_names, inputs, strict=False)} # NOTE: encoder_hidden_states is not used and deleted ort_inputs.pop("encoder_hidden_states") sess_options = SessionOptions() @@ -216,7 +216,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwarg sess = InferenceSession(onnx_model_path, sess_options, providers=["CPUExecutionProvider"]) out = sess.run(None, ort_inputs) - for ort_out, torch_out in zip(out, [logits, *present]): + for ort_out, torch_out in zip(out, [logits, *present], strict=False): torch.testing.assert_close(ort_out, torch_out.cpu().numpy(), check_dtype=True, atol=1e-4, rtol=1e-2) print("========== [SUCCESS] ORT inference test on Decoder ==========") diff --git a/onnxruntime/python/tools/transformers/models/bert/eval_squad.py b/onnxruntime/python/tools/transformers/models/bert/eval_squad.py index 8797fd9c2cfaf..680b3455ade2d 100644 --- a/onnxruntime/python/tools/transformers/models/bert/eval_squad.py +++ b/onnxruntime/python/tools/transformers/models/bert/eval_squad.py @@ -33,7 +33,7 @@ from importlib_metadata import PackageNotFoundError, version from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any from datasets import load_dataset from evaluate import evaluator @@ -60,7 +60,7 @@ def get_package_version(package_name: str): def load_onnx_model( - model_id: str, onnx_path: Optional[str] = None, provider="CUDAExecutionProvider", use_io_binding: bool = False + model_id: str, onnx_path: str | None = None, provider="CUDAExecutionProvider", use_io_binding: bool = False ): """Load onnx model given pretrained model name and optional ONNX model path. If onnx_path is None, the default onnx model from optimum will be used. @@ -95,7 +95,7 @@ def load_onnx_model( return model, onnx_path -def output_details(results: List[Dict[str, Any]], csv_filename: str): +def output_details(results: list[dict[str, Any]], csv_filename: str): """Output a CSV file with detail of each test results. Args: @@ -136,7 +136,7 @@ def output_details(results: List[Dict[str, Any]], csv_filename: str): print(f"Detail results are saved to csv file: {csv_filename}") -def output_summary(results: List[Dict[str, Any]], csv_filename: str, metric_name: str): +def output_summary(results: list[dict[str, Any]], csv_filename: str, metric_name: str): """Output a CSV file with summary of a metric on combinations of batch_size and sequence_length. 
Args: diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index 1b12fe9005175..b405c19b04689 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -12,7 +12,6 @@ import tempfile import time from pathlib import Path -from typing import Dict, List, Tuple, Union import numpy import onnx @@ -139,17 +138,17 @@ class Gpt2Inputs: def __init__(self, input_ids, position_ids, attention_mask, past): self.input_ids: torch.LongTensor = input_ids self.position_ids: torch.LongTensor = position_ids - self.attention_mask: Union[torch.LongTensor, torch.FloatTensor, torch.HalfTensor] = attention_mask - self.past: Union[List[torch.FloatTensor], List[torch.HalfTensor]] = past + self.attention_mask: torch.LongTensor | torch.FloatTensor | torch.HalfTensor = attention_mask + self.past: list[torch.FloatTensor] | list[torch.HalfTensor] = past - def to_list(self) -> List: + def to_list(self) -> list: input_list = [v for v in [self.input_ids, self.position_ids, self.attention_mask] if v is not None] if self.past: input_list.extend(self.past) return input_list - def to_tuple(self) -> Tuple: + def to_tuple(self) -> tuple: return tuple(v for v in [self.input_ids, self.position_ids, self.attention_mask, self.past] if v is not None) def to_fp32(self): @@ -241,7 +240,7 @@ def get_output_shapes( sequence_length: int, config: GPT2Config, model_class: str = "GPT2LMHeadModel", - ) -> Dict[str, List[int]]: + ) -> dict[str, list[int]]: """Returns a dictionary with output name as key, and shape as value.""" num_attention_heads = config.num_attention_heads hidden_size = config.hidden_size @@ -541,7 +540,7 @@ def optimize_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: List[str] = [ # noqa: B006 + op_block_list: list[str] = [ # noqa: B006 "Add", "LayerNormalization", "SkipLayerNormalization", @@ -698,8 +697,8 @@ def get_outputs_from_io_binding_buffer(ort_session, output_buffers, output_shape def onnxruntime_inference_with_binded_io( ort_session, inputs: Gpt2Inputs, - output_buffers: Dict[str, torch.Tensor], - output_shapes: Dict[str, List[int]], + output_buffers: dict[str, torch.Tensor], + output_shapes: dict[str, list[int]], total_runs: int = 0, return_numpy: bool = True, include_copy_output_latency: bool = False, diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index 7bf8bcb82e59a..89fd613ecbbc2 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -427,7 +427,7 @@ def convert_to_float16(args: argparse.Namespace, old_paths: list[str], rank: int new_paths = [decoder_model_fp16_path, decoder_with_past_model_fp16_path, decoder_merged_model_fp16_path] logger.info("Converting to float16...") - for fp32_path, fp16_path in zip(old_paths, new_paths): + for fp32_path, fp16_path in zip(old_paths, new_paths, strict=False): if os.path.exists(fp32_path): model = OnnxModel(onnx.load_model(fp32_path, load_external_data=True)) model.convert_float_to_float16(keep_io_types=False) @@ -867,7 +867,7 @@ def main(): # Run the optimizer script. 
logger.info("Optimizing models...") - for orig_path, opt_path in zip(old_paths, new_paths): + for orig_path, opt_path in zip(old_paths, new_paths, strict=False): if os.path.exists(orig_path): optimize_export(args, l_config, input_path=orig_path, output_path=opt_path, world_size=world_size) @@ -912,7 +912,7 @@ def main(): ) logger.info("Quantizing to int8...") - for fp32_path, int8_path in zip(old_paths, new_paths): + for fp32_path, int8_path in zip(old_paths, new_paths, strict=False): if os.path.exists(fp32_path): ort_quantization.quantize_dynamic( fp32_path, @@ -952,7 +952,7 @@ def main(): ) new_paths = [decoder_model_int4_path, decoder_with_past_model_int4_path, decoder_merged_model_int4_path] - for fp_path, int4_path in zip(old_paths, new_paths): + for fp_path, int4_path in zip(old_paths, new_paths, strict=False): if os.path.exists(fp_path): model = onnx.load_model(fp_path, load_external_data=True) quant = MatMul4BitsQuantizer( diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index 274d56df3f12c..c7e0e31765a4f 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -41,7 +41,7 @@ import traceback from concurrent.futures import ProcessPoolExecutor from datetime import datetime -from typing import Any, Dict, List +from typing import Any import benchmark_helper import numpy as np @@ -63,7 +63,7 @@ def test_torch_latency( global_lengths, test_times, num_threads, -) -> List[Dict[str, Any]]: +) -> list[dict[str, Any]]: if num_threads > 0: torch.set_num_threads(num_threads) @@ -143,7 +143,7 @@ def test_ort_latency( use_compact_memory=False, use_half4=False, disable_parity=False, -) -> List[Dict[str, Any]]: +) -> list[dict[str, Any]]: results = [] for batch_size in batch_sizes: for sequence_length in sequence_lengths: @@ -250,7 +250,7 @@ def test_ort_memory( global_length, test_times, num_threads, -) -> Dict[str, Any]: +) -> dict[str, Any]: logger.info( f"Testing memory for model={onnx_model_path}, batch_size={batch_size}, sequence_length={sequence_length}, " f"global_length={global_length}, test_times={test_times}, num_threads={num_threads}" @@ -307,7 +307,7 @@ def find_onnx_model(model_name, onnx_dir="."): return onnx_model_path -def test_memory(args, device) -> Dict[str, Any]: +def test_memory(args, device) -> dict[str, Any]: if len(args.batch_sizes) > 1: raise RuntimeError("For memory test, only one batch_size (-b) is allowed.") if len(args.sequence_lengths) > 1: @@ -330,7 +330,7 @@ def test_memory(args, device) -> Dict[str, Any]: ) -def test_ort(args, device) -> List[Dict[str, Any]]: +def test_ort(args, device) -> list[dict[str, Any]]: model_name = args.model onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx @@ -385,7 +385,7 @@ def test_ort(args, device) -> List[Dict[str, Any]]: ) -def test_torch(args, device) -> List[Dict[str, Any]]: +def test_torch(args, device) -> list[dict[str, Any]]: model = load_torch_model(args.model, device) return test_torch_latency( device, @@ -399,7 +399,7 @@ def test_torch(args, device) -> List[Dict[str, Any]]: ) -def test_latency(args, device) -> List[Dict[str, Any]]: +def test_latency(args, device) -> list[dict[str, Any]]: if args.engine == "onnxruntime": return test_ort(args, device) @@ -550,7 +550,7 @@ def output_details(results, csv_filename): print(f"Detail results are saved to 
csv file: {csv_filename}") -def run(args) -> List[Dict[str, Any]]: +def run(args) -> list[dict[str, Any]]: torch.set_grad_enabled(False) # set random seed manually to get deterministic results @@ -565,7 +565,7 @@ def run(args) -> List[Dict[str, Any]]: return test_latency(args, device) -def launch_test(arguments) -> List[Dict[str, Any]]: +def launch_test(arguments) -> list[dict[str, Any]]: if not torch.cuda.is_available(): raise RuntimeError("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.") diff --git a/onnxruntime/python/tools/transformers/models/longformer/longformer_helper.py b/onnxruntime/python/tools/transformers/models/longformer/longformer_helper.py index 1794bf75b4e6f..08a2ba629fbc3 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/longformer_helper.py +++ b/onnxruntime/python/tools/transformers/models/longformer/longformer_helper.py @@ -6,7 +6,6 @@ # This script helps creating dummy inputs for Longformer model. import logging -from typing import Dict, List, Tuple, Union import numpy import torch @@ -23,16 +22,16 @@ class LongformerInputs: def __init__(self, input_ids, attention_mask, global_attention_mask): self.input_ids: torch.LongTensor = input_ids - self.attention_mask: Union[torch.FloatTensor, torch.HalfTensor] = attention_mask - self.global_attention_mask: Union[torch.FloatTensor, torch.HalfTensor] = global_attention_mask + self.attention_mask: torch.FloatTensor | torch.HalfTensor = attention_mask + self.global_attention_mask: torch.FloatTensor | torch.HalfTensor = global_attention_mask - def to_list(self) -> List: + def to_list(self) -> list: return [v for v in [self.input_ids, self.attention_mask, self.global_attention_mask] if v is not None] - def to_tuple(self) -> Tuple: + def to_tuple(self) -> tuple: return tuple(v for v in self.to_list()) - def get_ort_inputs(self) -> Dict: + def get_ort_inputs(self) -> dict: return { "input_ids": numpy.ascontiguousarray(self.input_ids.cpu().numpy()), "attention_mask": numpy.ascontiguousarray(self.attention_mask.cpu().numpy()), @@ -69,7 +68,7 @@ def get_dummy_inputs( return LongformerInputs(input_ids, attention_mask, global_attention_mask) @staticmethod - def get_output_shapes(batch_size: int, sequence_length: int, hidden_size: int) -> Dict[str, List[int]]: + def get_output_shapes(batch_size: int, sequence_length: int, hidden_size: int) -> dict[str, list[int]]: """Returns a dictionary with output name as key, and shape as value.""" return { "last_state": [batch_size, sequence_length, hidden_size], diff --git a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py index f75a4527be57d..16d71d5057b02 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py +++ b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py @@ -11,8 +11,8 @@ import csv import statistics import time +from collections.abc import Mapping from datetime import datetime -from typing import List, Mapping, Optional import torch from image_decoder import SAM2ImageDecoder @@ -84,7 +84,7 @@ def __init__( def __repr__(self): return f"{vars(self)}" - def shape_dict(self) -> Mapping[str, List[int]]: + def shape_dict(self) -> Mapping[str, list[int]]: if self.component == "image_encoder": return encoder_shape_dict(self.batch_size, self.height, self.width) else: @@ -283,7 +283,7 @@ def run_torch(config: TestConfig): def run_test( args: argparse.Namespace, - csv_writer: Optional[csv.DictWriter] = None, 
+ csv_writer: csv.DictWriter | None = None, ): use_gpu: bool = args.use_gpu enable_cuda_graph: bool = args.use_cuda_graph diff --git a/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py b/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py index b9f30d0371dbe..c5ce339732063 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py +++ b/onnxruntime/python/tools/transformers/models/sam2/image_encoder.py @@ -75,7 +75,7 @@ def forward( feats = [ feat.permute(1, 2, 0).reshape(1, -1, *feat_size) - for feat, feat_size in zip(vision_feats[::-1], feat_sizes[::-1]) + for feat, feat_size in zip(vision_feats[::-1], feat_sizes[::-1], strict=False) ][::-1] if nvtx_helper is not None: diff --git a/onnxruntime/python/tools/transformers/models/sam2/sam2_demo.py b/onnxruntime/python/tools/transformers/models/sam2/sam2_demo.py index af6b0e17e77f1..7f43724a6343f 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/sam2_demo.py +++ b/onnxruntime/python/tools/transformers/models/sam2/sam2_demo.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- import os -from typing import Union import matplotlib.image as mpimg import matplotlib.pyplot as plt @@ -64,7 +63,7 @@ def show_masks( output_image_file_prefix=None, image_files=None, ): - for i, (mask, score) in enumerate(zip(masks, scores)): + for i, (mask, score) in enumerate(zip(masks, scores, strict=False)): plt.figure(figsize=(10, 10)) plt.imshow(image) show_mask(mask, plt.gca(), borders=borders) @@ -92,7 +91,7 @@ def show_masks( def get_predictor( sam2_dir: str, - device: Union[str, torch.device], + device: str | torch.device, dtype: torch.dtype, model_type="sam2_hiera_large", engine="torch", @@ -303,7 +302,7 @@ def run_demo( def show_all_images(left_images, right_images, suffix=""): # Show images in two rows since display screen is horizontal in most cases. 
fig, axes = plt.subplots(nrows=2, ncols=len(left_images), figsize=(19.20, 10.80)) - for i, (left_img_path, right_img_path) in enumerate(zip(left_images, right_images)): + for i, (left_img_path, right_img_path) in enumerate(zip(left_images, right_images, strict=False)): left_img = mpimg.imread(left_img_path) right_img = mpimg.imread(right_img_path) diff --git a/onnxruntime/python/tools/transformers/models/sam2/sam2_image_onnx_predictor.py b/onnxruntime/python/tools/transformers/models/sam2/sam2_image_onnx_predictor.py index 3c0c886b877f0..2f34bfa9aa09a 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/sam2_image_onnx_predictor.py +++ b/onnxruntime/python/tools/transformers/models/sam2/sam2_image_onnx_predictor.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- import logging -from typing import Optional, Tuple, Union import numpy as np import torch @@ -41,7 +40,7 @@ def create_session( onnx_path: str, session_options=None, provider="CUDAExecutionProvider", - device: Union[str, torch.device] = "cuda", + device: str | torch.device = "cuda", enable_cuda_graph=False, ) -> CudaSession: ort_session = create_ort_session( @@ -59,7 +58,7 @@ def __init__( image_decoder_onnx_path: str = "", image_decoder_multi_onnx_path: str = "", provider: str = "CUDAExecutionProvider", - device: Union[str, torch.device] = "cuda", + device: str | torch.device = "cuda", onnx_dtype: torch.dtype = torch.float32, mask_threshold=0.0, max_hole_area=0.0, @@ -114,7 +113,7 @@ def __init__( ) @torch.no_grad() - def set_image(self, image: Union[np.ndarray, Image]): + def set_image(self, image: np.ndarray | Image): """ Calculates the image embeddings for the provided image. @@ -162,14 +161,14 @@ def set_image(self, image: Union[np.ndarray, Image]): @torch.no_grad() def _predict( self, - point_coords: Optional[torch.Tensor], - point_labels: Optional[torch.Tensor], - boxes: Optional[torch.Tensor] = None, - mask_input: Optional[torch.Tensor] = None, + point_coords: torch.Tensor | None, + point_labels: torch.Tensor | None, + boxes: torch.Tensor | None = None, + mask_input: torch.Tensor | None = None, multimask_output: bool = True, return_logits: bool = False, img_idx: int = -1, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Predict masks for the given input prompts, using the currently set image. 
Input prompts are batched torch tensors and are expected to already be diff --git a/onnxruntime/python/tools/transformers/models/sam2/sam2_utils.py b/onnxruntime/python/tools/transformers/models/sam2/sam2_utils.py index 4ec4ccc274291..d983cefaaaeec 100644 --- a/onnxruntime/python/tools/transformers/models/sam2/sam2_utils.py +++ b/onnxruntime/python/tools/transformers/models/sam2/sam2_utils.py @@ -5,7 +5,7 @@ import logging import os import sys -from typing import List, Mapping, Union +from collections.abc import Mapping import torch from sam2.build_sam import build_sam2 @@ -27,7 +27,7 @@ def _get_model_cfg(model_type) -> str: return model_cfg -def load_sam2_model(sam2_dir, model_type, device: Union[str, torch.device] = "cpu") -> SAM2Base: +def load_sam2_model(sam2_dir, model_type, device: str | torch.device = "cpu") -> SAM2Base: checkpoints_dir = os.path.join(sam2_dir, "checkpoints") sam2_config_dir = os.path.join(sam2_dir, "sam2_configs") if not os.path.exists(sam2_dir): @@ -65,7 +65,7 @@ def sam2_onnx_path(output_dir, model_type, component, multimask_output=False, su ) -def encoder_shape_dict(batch_size: int, height: int, width: int) -> Mapping[str, List[int]]: +def encoder_shape_dict(batch_size: int, height: int, width: int) -> Mapping[str, list[int]]: assert height == 1024 and width == 1024, "Only 1024x1024 images are supported." return { "image": [batch_size, 3, height, width], diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py index a50940933eb82..30f4663100d8a 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py @@ -23,7 +23,7 @@ import os import sys from importlib.metadata import PackageNotFoundError, version -from typing import Any, Dict, List, Optional +from typing import Any import controlnet_aux import cv2 @@ -307,7 +307,7 @@ def max_batch(args): return max_batch_size -def get_metadata(args, is_xl: bool = False) -> Dict[str, Any]: +def get_metadata(args, is_xl: bool = False) -> dict[str, Any]: metadata = { "command": " ".join(['"' + x + '"' if " " in x else x for x in sys.argv]), "args.prompt": args.prompt, @@ -410,7 +410,7 @@ def initialize_pipeline( lora_scale: float = 1.0, use_fp16_vae: bool = True, use_vae: bool = True, - framework_model_dir: Optional[str] = None, + framework_model_dir: str | None = None, max_cuda_graphs: int = 1, ): pipeline_info = PipelineInfo( @@ -649,7 +649,7 @@ def get_canny_image(image) -> Image.Image: return image -def process_controlnet_images_xl(args) -> List[Image.Image]: +def process_controlnet_images_xl(args) -> list[Image.Image]: """ Process control image for SDXL control net. 
""" diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py index c2cfc165e32cf..8dcda8a7633ac 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_models.py @@ -24,7 +24,6 @@ import logging import os import tempfile -from typing import Dict, List, Optional import onnx import onnx_graphsurgeon as gs @@ -135,7 +134,7 @@ def is_xl_refiner(self) -> bool: def use_safetensors(self) -> bool: return self.is_xl() or self.version in ["sd-turbo"] - def stages(self) -> List[str]: + def stages(self) -> list[str]: if self.is_xl_base_or_turbo(): return ["clip", "clip2", "unetxl"] + (["vae"] if self._use_vae else []) @@ -150,11 +149,11 @@ def vae_scaling_factor(self) -> float: def vae_torch_fallback(self) -> bool: return self.is_xl() and not self._use_fp16_vae - def custom_fp16_vae(self) -> Optional[str]: + def custom_fp16_vae(self) -> str | None: # For SD XL, use a VAE that fine-tuned to run in fp16 precision without generating NaNs return "madebyollin/sdxl-vae-fp16-fix" if self._use_fp16_vae and self.is_xl() else None - def custom_unet(self) -> Optional[str]: + def custom_unet(self) -> str | None: return "latent-consistency/lcm-sdxl" if self._use_lcm and self.is_xl_base() else None @staticmethod @@ -372,13 +371,13 @@ def from_pretrained(self, model_class, framework_model_dir, subfolder=None, mode def load_model(self, framework_model_dir: str, subfolder: str): pass - def get_input_names(self) -> List[str]: + def get_input_names(self) -> list[str]: pass - def get_output_names(self) -> List[str]: + def get_output_names(self) -> list[str]: pass - def get_dynamic_axes(self) -> Dict[str, Dict[int, str]]: + def get_dynamic_axes(self) -> dict[str, dict[int, str]]: pass def get_sample_input(self, batch_size, image_height, image_width) -> tuple: @@ -418,7 +417,7 @@ def get_input_profile(self, batch_size, image_height, image_width, static_batch, def get_shape_dict(self, batch_size, image_height, image_width): pass - def fp32_input_output_names(self) -> List[str]: + def fp32_input_output_names(self) -> list[str]: """For CUDA EP, we export ONNX model with FP32 first, then convert it to mixed precision model. This is a list of input or output names that are kept as float32 in optimized model. 
""" @@ -720,7 +719,7 @@ def __init__(self, unet, controlnets: ControlNetModel): def forward(self, sample, timestep, encoder_hidden_states, controlnet_images, controlnet_scales): for i, (controlnet_image, conditioning_scale, controlnet) in enumerate( - zip(controlnet_images, controlnet_scales, self.controlnets) + zip(controlnet_images, controlnet_scales, self.controlnets, strict=False) ): down_samples, mid_sample = controlnet( sample, @@ -739,7 +738,7 @@ def forward(self, sample, timestep, encoder_hidden_states, controlnet_images, co else: down_block_res_samples = [ samples_prev + samples_curr - for samples_prev, samples_curr in zip(down_block_res_samples, down_samples) + for samples_prev, samples_curr in zip(down_block_res_samples, down_samples, strict=False) ] mid_block_res_sample += mid_sample @@ -772,7 +771,7 @@ def forward( ): added_cond_kwargs = {"text_embeds": text_embeds, "time_ids": time_ids} for i, (controlnet_image, conditioning_scale, controlnet) in enumerate( - zip(controlnet_images, controlnet_scales, self.controlnets) + zip(controlnet_images, controlnet_scales, self.controlnets, strict=False) ): down_samples, mid_sample = controlnet( sample, @@ -790,7 +789,7 @@ def forward( else: down_block_res_samples = [ samples_prev + samples_curr - for samples_prev, samples_curr in zip(down_block_res_samples, down_samples) + for samples_prev, samples_curr in zip(down_block_res_samples, down_samples, strict=False) ] mid_block_res_sample += mid_sample @@ -1152,7 +1151,7 @@ def __init__( device, max_batch_size, fp16: bool = False, - custom_fp16_vae: Optional[str] = None, + custom_fp16_vae: str | None = None, ): super().__init__( pipeline_info, @@ -1232,7 +1231,7 @@ def get_sample_input(self, batch_size, image_height, image_width): dtype = torch.float16 if self.fp16 else torch.float32 return (torch.randn(batch_size, 4, latent_height, latent_width, dtype=dtype, device=self.device),) - def fp32_input_output_names(self) -> List[str]: + def fp32_input_output_names(self) -> list[str]: return [] diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_schedulers.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_schedulers.py index 41d2d267c5e11..ff23874000019 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_schedulers.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/diffusion_schedulers.py @@ -21,7 +21,6 @@ # limitations under the License. 
# -------------------------------------------------------------------------- -from typing import List, Optional import numpy as np import torch @@ -391,8 +390,8 @@ def __init__( predict_x0: bool = True, solver_type: str = "bh2", lower_order_final: bool = True, - disable_corrector: Optional[List[int]] = None, - use_karras_sigmas: Optional[bool] = False, + disable_corrector: list[int] | None = None, + use_karras_sigmas: bool | None = False, timestep_spacing: str = "linspace", steps_offset: int = 0, sigma_min=None, @@ -627,7 +626,7 @@ def multistep_uni_p_bh_update( model_output: torch.FloatTensor, *args, sample: torch.FloatTensor = None, - order: Optional[int] = None, + order: int | None = None, **kwargs, ) -> torch.FloatTensor: prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) @@ -734,7 +733,7 @@ def multistep_uni_c_bh_update( *args, last_sample: torch.FloatTensor = None, this_sample: torch.FloatTensor = None, - order: Optional[int] = None, + order: int | None = None, **kwargs, ) -> torch.FloatTensor: this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) @@ -1084,7 +1083,7 @@ def step( model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor, - generator: Optional[torch.Generator] = None, + generator: torch.Generator | None = None, ): if self.num_inference_steps is None: raise ValueError( diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py index 7609ae10fc96d..d36411a1fa84d 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py @@ -5,7 +5,6 @@ import hashlib import os from enum import Enum -from typing import Optional import torch from diffusion_models import CLIP, VAE, CLIPWithProj, PipelineInfo, UNet, UNetXL @@ -275,7 +274,7 @@ def vae_decode(self, latents): def get_engine_paths( - work_dir: str, pipeline_info: PipelineInfo, engine_type: EngineType, framework_model_dir: Optional[str] = None + work_dir: str, pipeline_info: PipelineInfo, engine_type: EngineType, framework_model_dir: str | None = None ): root_dir = work_dir or "." 
short_name = pipeline_info.short_name() diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py index 56012e223b18c..040e3a38dbc52 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py @@ -6,7 +6,6 @@ import gc import logging import os -from typing import Dict, List, Optional import onnx import torch @@ -72,7 +71,7 @@ def metadata(self, name: str): data[f"{name}.gpu_graph_id"] = self.current_gpu_binding.last_run_gpu_graph_id return data - def infer(self, feed_dict: Dict[str, torch.Tensor]): + def infer(self, feed_dict: dict[str, torch.Tensor]): return self.current_gpu_binding.infer(feed_dict=feed_dict, disable_cuda_graph_in_run=not self.enable_cuda_graph) def allocate_buffers(self, shape_dict, device): @@ -93,7 +92,7 @@ def __init__( onnx_opset_version: int, use_cuda_graph: bool, fp16: bool = True, - force_fp32_ops: Optional[List[str]] = None, + force_fp32_ops: list[str] | None = None, optimize_by_ort: bool = True, ): self.onnx_opset_version = onnx_opset_version @@ -140,7 +139,7 @@ def _configure( onnx_opset_version: int, use_cuda_graph: bool, fp16: bool = True, - force_fp32_ops: Optional[List[str]] = None, + force_fp32_ops: list[str] | None = None, optimize_by_ort: bool = True, ): self.model_config[model_name] = _ModelConfig( @@ -238,11 +237,11 @@ def build_engines( engine_dir: str, framework_model_dir: str, onnx_dir: str, - tmp_dir: Optional[str] = None, + tmp_dir: str | None = None, onnx_opset_version: int = 17, device_id: int = 0, save_fp32_intermediate_model: bool = False, - import_engine_dir: Optional[str] = None, + import_engine_dir: str | None = None, max_cuda_graphs: int = 1, ): self.torch_device = torch.device("cuda", device_id) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 52d332848357f..24897756b2d7a 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -21,7 +21,6 @@ import shutil import tempfile from pathlib import Path -from typing import List, Optional import __init__ # noqa: F401. 
Walk-around to run this script directly import coloredlogs @@ -101,11 +100,11 @@ def _optimize_sd_pipeline( source_dir: Path, target_dir: Path, pipeline_type: str, - model_list: List[str], - use_external_data_format: Optional[bool], + model_list: list[str], + use_external_data_format: bool | None, float16: bool, bfloat16: bool, - force_fp32_ops: List[str], + force_fp32_ops: list[str], enable_runtime_optimization: bool, args, ): @@ -400,7 +399,7 @@ def _optimize_sd_pipeline( return op_counters -def _copy_extra_directory(source_dir: Path, target_dir: Path, model_list: List[str]): +def _copy_extra_directory(source_dir: Path, target_dir: Path, model_list: list[str]): """Copy extra directory that does not have onnx model Args: @@ -448,7 +447,7 @@ def optimize_stable_diffusion_pipeline( input_dir: str, output_dir: str, overwrite: bool, - use_external_data_format: Optional[bool], + use_external_data_format: bool | None, float16: bool, enable_runtime_optimization: bool, args, @@ -480,7 +479,7 @@ def optimize_stable_diffusion_pipeline( ) -def parse_arguments(argv: Optional[List[str]] = None): +def parse_arguments(argv: list[str] | None = None): """Parse arguments Returns: @@ -570,7 +569,7 @@ def parse_arguments(argv: Optional[List[str]] = None): return args -def main(argv: Optional[List[str]] = None): +def main(argv: list[str] | None = None): args = parse_arguments(argv) logger.info("Arguments: %s", str(args)) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index ac955f50141d2..e2f202e32221d 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -24,7 +24,7 @@ import pathlib import random import time -from typing import Any, Dict, List, Optional +from typing import Any import numpy as np import nvtx @@ -485,7 +485,7 @@ def decode_latent(self, latents): self.stop_profile("vae") return images - def print_summary(self, tic, toc, batch_size, vae_enc=False, pil=False) -> Dict[str, Any]: + def print_summary(self, tic, toc, batch_size, vae_enc=False, pil=False) -> dict[str, Any]: throughput = batch_size / (toc - tic) latency_clip = cudart.cudaEventElapsedTime(self.events["clip-start"], self.events["clip-stop"])[1] latency_unet = cudart.cudaEventElapsedTime(self.events["denoise-start"], self.events["denoise-stop"])[1] @@ -546,7 +546,7 @@ def pt_to_numpy(images: torch.FloatTensor): """ return ((images + 1) / 2).clamp(0, 1).detach().permute(0, 2, 3, 1).float().cpu().numpy() - def metadata(self) -> Dict[str, Any]: + def metadata(self) -> dict[str, Any]: data = { "actual_steps": self.actual_steps, "seed": self.get_current_seed(), @@ -561,7 +561,7 @@ def metadata(self) -> Dict[str, Any]: return data - def save_images(self, images: List, prompt: List[str], negative_prompt: List[str], metadata: Dict[str, Any]): + def save_images(self, images: list, prompt: list[str], negative_prompt: list[str], metadata: dict[str, Any]): session_id = str(random.randint(1000, 9999)) for i, image in enumerate(images): seed = str(self.get_current_seed()) @@ -747,17 +747,17 @@ def _infer( def run( self, - prompt: List[str], - negative_prompt: List[str], + prompt: list[str], + negative_prompt: list[str], image_height: int, image_width: int, denoising_steps: int = 30, guidance: float = 5.0, - seed: Optional[int] = None, - image: 
Optional[torch.Tensor] = None, + seed: int | None = None, + image: torch.Tensor | None = None, strength: float = 0.3, - controlnet_images: Optional[torch.Tensor] = None, - controlnet_scales: Optional[torch.Tensor] = None, + controlnet_images: torch.Tensor | None = None, + controlnet_scales: torch.Tensor | None = None, show_latency: bool = False, output_type: str = "pil", deterministic: bool = False, diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py index 86477a7e3168b..ab3d3d8f58545 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py @@ -1,6 +1,5 @@ import argparse import os -from typing import Optional import cv2 import open_clip @@ -19,7 +18,7 @@ def arg_parser(): return args -def image_encoder(img: Image.Image, cache_dir: Optional[str] = None): # -> torch.Tensor: +def image_encoder(img: Image.Image, cache_dir: str | None = None): # -> torch.Tensor: device = "cuda" if torch.cuda.is_available() else "cpu" model, _, preprocess = open_clip.create_model_and_transforms( "ViT-B-16-plus-240", pretrained="laion400m_e32", cache_dir=cache_dir @@ -46,7 +45,7 @@ def load_image(image_path: str): # -> Image.Image: return img -def generate_score(image1: str, image2: str, cache_dir: Optional[str] = None): # -> float: +def generate_score(image1: str, image2: str, cache_dir: str | None = None): # -> float: test_img = load_image(image1) data_img = load_image(image2) img1 = image_encoder(test_img, cache_dir) diff --git a/onnxruntime/python/tools/transformers/models/t5/past_helper.py b/onnxruntime/python/tools/transformers/models/t5/past_helper.py index 915b09da79fe6..0f72a89498dad 100644 --- a/onnxruntime/python/tools/transformers/models/t5/past_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/past_helper.py @@ -5,7 +5,6 @@ # -------------------------------------------------------------------------- import logging -from typing import List, Tuple import torch @@ -71,7 +70,7 @@ def group_by_layer(past, num_layers): ) @staticmethod - def back_group_by_layer(past_key_values: Tuple[Tuple[torch.Tensor]]): + def back_group_by_layer(past_key_values: tuple[tuple[torch.Tensor]]): """Categorize present_key_values from self and cross attention to layer by layer. Reorder past state from grouped by self/cross attention to grouped by layer. @@ -101,7 +100,7 @@ def back_group_by_layer(past_key_values: Tuple[Tuple[torch.Tensor]]): return past_tuples @staticmethod - def group_by_self_and_cross(present_key_values: Tuple[torch.Tensor], concat: bool = False): + def group_by_self_and_cross(present_key_values: tuple[torch.Tensor], concat: bool = False): """Categorize present_key_values into self and cross attention. Split present state from grouped by layer to grouped by self/cross attention. @@ -118,8 +117,8 @@ def group_by_self_and_cross(present_key_values: Tuple[torch.Tensor], concat: boo present_self (Tuple[torch.Tensor]): present key and values from self attention present_cross (Tuple[torch.Tensor]): present key and values from cross attention """ - present_self: List[torch.Tensor] = [] - present_cross: List[torch.Tensor] = [] + present_self: list[torch.Tensor] = [] + present_cross: list[torch.Tensor] = [] for _, present_layer_i in enumerate(present_key_values): assert len(present_layer_i) == 4, f"Expected to have four items. 
Got {len(present_layer_i)}" present_key_self, present_value_self, present_key_cross, present_value_cross = present_layer_i @@ -131,7 +130,7 @@ def group_by_self_and_cross(present_key_values: Tuple[torch.Tensor], concat: boo return present_self, present_cross @staticmethod - def get_input_names(past_key_values: Tuple[Tuple[torch.Tensor]], encoder=True): + def get_input_names(past_key_values: tuple[tuple[torch.Tensor]], encoder=True): """Process input names of model wrapper. Args: diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py index 19e6bba22dc1a..a93c1705b2cd9 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py @@ -8,7 +8,6 @@ import os import tempfile from pathlib import Path -from typing import List, Optional, Union import numpy import onnx @@ -34,8 +33,8 @@ def __init__( self, decoder: torch.nn.Module, lm_head: torch.nn.Module, - config: Union[T5Config, MT5Config], - decoder_start_token_id: Optional[int] = None, + config: T5Config | MT5Config, + decoder_start_token_id: int | None = None, ): super().__init__() self.decoder = decoder @@ -133,11 +132,11 @@ def __init__( ): self.decoder_input_ids: torch.LongTensor = decoder_input_ids self.encoder_attention_mask: torch.LongTensor = encoder_attention_mask - self.past_key_values: Union[List[torch.FloatTensor], List[torch.HalfTensor], None] = past_key_values + self.past_key_values: list[torch.FloatTensor] | list[torch.HalfTensor] | None = past_key_values @staticmethod def create_dummy( - config: Union[T5Config, MT5Config], + config: T5Config | MT5Config, batch_size: int, encode_sequence_length: int, past_decode_sequence_length: int, @@ -211,7 +210,7 @@ def create_dummy( return T5DecoderInputs(decoder_input_ids, encoder_inputs.attention_mask, past) - def to_list(self) -> List: + def to_list(self) -> list: input_list = [ self.decoder_input_ids, self.encoder_attention_mask, @@ -232,7 +231,7 @@ def to_fp32(self): class T5DecoderHelper: @staticmethod def export_onnx( - decoder: Union[T5Decoder, T5DecoderInit], + decoder: T5Decoder | T5DecoderInit, device: torch.device, onnx_model_path: str, verbose: bool = True, @@ -370,7 +369,7 @@ def onnxruntime_inference(ort_session, inputs: T5DecoderInputs): @staticmethod def verify_onnx( - model: Union[T5Decoder, T5DecoderInit], + model: T5Decoder | T5DecoderInit, ort_session: InferenceSession, device: torch.device, use_int32_inputs: bool, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py index fb61e970c1e0c..c6b0f7ee3adc2 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder.py @@ -9,7 +9,6 @@ import random import tempfile from pathlib import Path -from typing import List, Union import numpy import onnx @@ -26,7 +25,7 @@ class T5Encoder(torch.nn.Module): """T5 encoder outputs only the last hidden state""" - def __init__(self, encoder, config: Union[T5Config, MT5Config]): + def __init__(self, encoder, config: T5Config | MT5Config): super().__init__() self.encoder = encoder self.config = config @@ -72,7 +71,7 @@ def create_dummy( attention_mask[i, :padding_position] = 0 return T5EncoderInputs(input_ids, attention_mask) - def to_list(self) -> List: + def to_list(self) -> list: input_list = [v for v in [self.input_ids, self.attention_mask] if v is not None] return 
input_list diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py index fd6ea45ef8b7c..c76d7aabdf11a 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py @@ -8,7 +8,6 @@ import os import tempfile from pathlib import Path -from typing import List, Optional, Union import numpy import onnx @@ -33,8 +32,8 @@ def __init__( encoder: torch.nn.Module, decoder: torch.nn.Module, lm_head: torch.nn.Module, - config: Union[T5Config, MT5Config], - decoder_start_token_id: Optional[int] = None, + config: T5Config | MT5Config, + decoder_start_token_id: int | None = None, ): super().__init__() self.config = config @@ -62,7 +61,7 @@ def __init__(self, encoder_input_ids, encoder_attention_mask, decoder_input_ids= @staticmethod def create_dummy( - config: Union[T5Config, MT5Config], + config: T5Config | MT5Config, batch_size: int, encode_sequence_length: int, use_decoder_input_ids: int, @@ -83,7 +82,7 @@ def create_dummy( return T5EncoderDecoderInitInputs(encoder_inputs.input_ids, encoder_inputs.attention_mask, decoder_input_ids) - def to_list(self) -> List: + def to_list(self) -> list: input_list = [self.encoder_input_ids, self.encoder_attention_mask] if self.decoder_input_ids is not None: input_list.append(self.decoder_input_ids) diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index f7dc9db0e82c8..d3f25e979887d 100755 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -7,7 +7,6 @@ import logging import os from pathlib import Path -from typing import Dict, List, Union import torch from float16 import float_to_float16_max_diff @@ -64,7 +63,7 @@ def load_model( merge_encoder_and_decoder_init: bool = True, model_type: str = "t5", state_dict_path: str = "", - ) -> Dict[str, torch.nn.Module]: + ) -> dict[str, torch.nn.Module]: """Load model given a pretrained name or path, then build models for ONNX conversion. 
Args: @@ -111,7 +110,7 @@ def load_model( @staticmethod def export_onnx( - model: Union[T5Encoder, T5Decoder, T5DecoderInit, T5EncoderDecoderInit], + model: T5Encoder | T5Decoder | T5DecoderInit | T5EncoderDecoderInit, device: torch.device, onnx_model_path: str, verbose: bool = True, @@ -151,7 +150,7 @@ def export_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: List[str] = [ # noqa: B006 + op_block_list: list[str] = [ # noqa: B006 "SimplifiedLayerNormalization", "SkipSimplifiedLayerNormalization", "Relu", @@ -257,7 +256,7 @@ def optimize_onnx( @staticmethod def verify_onnx( - model: Union[T5Encoder, T5Decoder, T5DecoderInit, T5EncoderDecoderInit], + model: T5Encoder | T5Decoder | T5DecoderInit | T5EncoderDecoderInit, ort_session: InferenceSession, device: torch.device, use_int32_inputs: bool, diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py index 87ac45101f0c0..feb688948d8f5 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_chain.py @@ -24,7 +24,7 @@ def verify_inputs(beam_inputs, graph_inputs): # Verify that ONNX graph's inputs match beam search op's inputs beam_required_inputs = list(filter(lambda beam_input: beam_input, beam_inputs)) assert len(graph_inputs) == len(beam_required_inputs) - for graph_input, beam_input in zip(graph_inputs, beam_required_inputs): + for graph_input, beam_input in zip(graph_inputs, beam_required_inputs, strict=False): # Check if graph_input is in beam_input to handle beam_input names with the "_fp16" suffix assert graph_input.name in beam_input diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py index 5da235d72ca0b..400cafc4c93c3 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py @@ -8,7 +8,6 @@ import os import tempfile from pathlib import Path -from typing import List, Optional, Union import numpy import onnx @@ -34,7 +33,7 @@ def __init__( self, decoder: torch.nn.Module, config: WhisperConfig, - decoder_start_token_id: Optional[int] = None, + decoder_start_token_id: int | None = None, ): super().__init__() self.decoder = decoder @@ -115,7 +114,7 @@ def __init__( past_key_values=None, ): self.decoder_input_ids: torch.LongTensor = decoder_input_ids - self.past_key_values: Union[List[torch.FloatTensor], List[torch.HalfTensor], None] = past_key_values + self.past_key_values: list[torch.FloatTensor] | list[torch.HalfTensor] | None = past_key_values @staticmethod def create_dummy( @@ -186,7 +185,7 @@ def create_dummy( return WhisperDecoderInputs(decoder_input_ids, past) - def to_list(self) -> List: + def to_list(self) -> list: input_list = [self.decoder_input_ids] if self.past_key_values: input_list.extend(self.past_key_values) @@ -333,7 +332,7 @@ def onnxruntime_inference(ort_session, inputs: WhisperDecoderInputs): @staticmethod def verify_onnx( - model: Union[WhisperDecoder, WhisperDecoderInit], + model: WhisperDecoder | WhisperDecoderInit, ort_session: InferenceSession, device: torch.device, use_int32_inputs: bool, diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py index 93281848a5c9c..0b9db81486caa 
100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder.py @@ -8,7 +8,6 @@ import os import tempfile from pathlib import Path -from typing import List import numpy import onnx @@ -67,7 +66,7 @@ def create_dummy( ) return WhisperEncoderInputs(input_features) - def to_list(self) -> List: + def to_list(self) -> list: if self.input_ids is None: return [] return [self.input_ids] diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py index fab2a2aa4c8a8..c7c7a7675c1a7 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_encoder_decoder_init.py @@ -8,7 +8,6 @@ import os import tempfile from pathlib import Path -from typing import List, Optional import numpy import onnx @@ -34,7 +33,7 @@ def __init__( encoder: torch.nn.Module, decoder: torch.nn.Module, config: WhisperConfig, - decoder_start_token_id: Optional[int] = None, + decoder_start_token_id: int | None = None, model_impl: str = "hf", model: torch.nn.Module = None, ): @@ -94,7 +93,7 @@ def create_dummy( return WhisperEncoderDecoderInitInputs(encoder_inputs.input_ids, decoder_input_ids) - def to_list(self) -> List: + def to_list(self) -> list: input_list = [self.encoder_input_ids] if self.decoder_input_ids is not None: input_list.append(self.decoder_input_ids) diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py index 9fb51dd9b43c0..38003c2693296 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_helper.py @@ -7,7 +7,6 @@ import logging import os from pathlib import Path -from typing import Dict, Tuple, Union import numpy as np import torch @@ -117,7 +116,7 @@ def load_model( device: torch.device, merge_encoder_and_decoder_init: bool = True, state_dict_path: str = "", - ) -> Dict[str, torch.nn.Module]: + ) -> dict[str, torch.nn.Module]: """Load model given a pretrained name or path, then build models for ONNX conversion. 
Args: @@ -170,7 +169,7 @@ def load_model( @staticmethod def export_onnx( - model: Union[WhisperEncoder, WhisperDecoder, WhisperDecoderInit, WhisperEncoderDecoderInit], + model: WhisperEncoder | WhisperDecoder | WhisperDecoderInit | WhisperEncoderDecoderInit, device: torch.device, onnx_model_path: str, verbose: bool = True, @@ -209,7 +208,7 @@ def export_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: Tuple[str] = ( + op_block_list: tuple[str] = ( "SimplifiedLayerNormalization", "SkipSimplifiedLayerNormalization", "Relu", @@ -460,7 +459,7 @@ def verify_onnx( } use_extra_decoding_ids = "extra_decoding_ids" in ort_names - for name, dtype in zip(ort_names, ort_dtypes): + for name, dtype in zip(ort_names, ort_dtypes, strict=False): if name == "input_features": inputs[name] = inputs[name].detach().cpu().numpy() elif name == "vocab_mask": diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 33506d6d00cac..ef80d36be3b18 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -9,7 +9,6 @@ import sys from collections import deque from pathlib import Path -from typing import Dict, List, Optional, Tuple from float16 import convert_float_to_float16 from onnx import ( @@ -35,16 +34,16 @@ def __init__(self, model): def initialize(self, model): self.model: ModelProto = model - self._node_name_suffix: Dict[str, int] = {} # key is node name prefix, value is the last suffix generated + self._node_name_suffix: dict[str, int] = {} # key is node name prefix, value is the last suffix generated self.shape_infer_helper: SymbolicShapeInferenceHelper = None self.enable_shape_infer: bool = True - self.all_graphs: Optional[List[GraphProto]] = None + self.all_graphs: list[GraphProto] | None = None # Cache of shape and data type from onnx graph to speed up optimization. # Be careful that fusion shall not reuse node output name for different shape/type (in adding/removing nodes) # Note that these do not cache the symbolic shape inference result. 
- self._dtype_dict: Optional[Dict[str, int]] = None - self._shape_dict: Optional[Dict[str, List]] = None + self._dtype_dict: dict[str, int] | None = None + self._shape_dict: dict[str, list] | None = None def disable_shape_inference(self): self.enable_shape_infer = False @@ -348,7 +347,7 @@ def match_parent( def match_parent_paths(self, node, paths, output_name_to_node): for i, path in enumerate(paths): - assert isinstance(path, (List, Tuple)) + assert isinstance(path, (list, tuple)) return_indice = [] matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice) if matched: @@ -358,7 +357,7 @@ def match_parent_paths(self, node, paths, output_name_to_node): def match_parent_paths_all(self, node, paths, output_name_to_node): match_i, matches, return_indices = [], [], [] for i, path in enumerate(paths): - assert isinstance(path, (List, Tuple)) + assert isinstance(path, (list, tuple)) return_indice = [] matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice) if matched: @@ -442,7 +441,7 @@ def match_child_path( self, node, child_op_types, - edges: Optional[List[Tuple[int, int]]] = None, + edges: list[tuple[int, int]] | None = None, input_name_to_nodes=None, exclude=[], # noqa: B006 ): @@ -600,7 +599,7 @@ def tensor_shape_to_list(self, tensor_type): shape_list.append("?") # shall not happen return shape_list - def get_dtype(self, name: str, symbolic_shape_helper: Optional[SymbolicShapeInferenceHelper] = None): + def get_dtype(self, name: str, symbolic_shape_helper: SymbolicShapeInferenceHelper | None = None): """Try get data type given a name (could be initializer, input or output of graph or node).""" if self._dtype_dict is None: @@ -625,7 +624,7 @@ def get_dtype(self, name: str, symbolic_shape_helper: Optional[SymbolicShapeInfe return None - def get_shape(self, name: str, symbolic_shape_helper: Optional[SymbolicShapeInferenceHelper] = None): + def get_shape(self, name: str, symbolic_shape_helper: SymbolicShapeInferenceHelper | None = None): """Try get shape given a name (could be initializer, input or output of graph or node).""" if self._shape_dict is None: @@ -1320,8 +1319,8 @@ def to_data_hash(tensor: TensorProto, base_dir: str = "") -> int: def has_same_value( tensor1: TensorProto, tensor2: TensorProto, - signature_cache1: Optional[dict] = None, - signature_cache2: Optional[dict] = None, + signature_cache1: dict | None = None, + signature_cache2: dict | None = None, ) -> bool: """Returns True when two tensors have same value. Note that name can be different. @@ -1354,7 +1353,7 @@ def has_same_value( return False - def remove_duplicated_initializer(self, cache: Optional[dict] = None): + def remove_duplicated_initializer(self, cache: dict | None = None): """Remove initializers with duplicated values, and only keep the first one. It could help reduce size of models (like ALBert) with shared weights. If require_raw_data passed, method will only compare raw_data initializers to speed runtime diff --git a/onnxruntime/python/tools/transformers/onnx_model_bart.py b/onnxruntime/python/tools/transformers/onnx_model_bart.py index 61a786d7af60b..496146dbf8cb5 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bart.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bart.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import logging -from typing import Optional from fusion_attention import AttentionMask from fusion_bart_attention import FusionBartAttention @@ -127,7 +126,7 @@ def __init__(self, model, num_heads, hidden_size, model_impl="hf"): self.attention_fusion = FusionBartAttention(self, self.hidden_size, self.num_heads, self.attention_mask) self.bart_reshape_fusion_preprocess = FusionBartReshape(self) - def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + def optimize(self, options: FusionOptions | None = None, add_dynamic_axes: bool = False): self.attention_fusion.use_multi_head_attention = False if options is None else options.use_multi_head_attention self.attention_fusion.disable_multi_head_attention_bias = ( False if options is None else options.disable_multi_head_attention_bias diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert.py b/onnxruntime/python/tools/transformers/onnx_model_bert.py index 26464fc32817d..c4e8b64fd8486 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import List, Optional from convert_to_packing_mode import PackingMode from fusion_attention import AttentionMask, FusionAttention @@ -147,7 +146,7 @@ def fuse_qordered_mamtul(self): fusion = FusionQOrderedMatMul(self) fusion.apply() - def get_graph_inputs_from_node_type(self, op_type: str, input_indices: List[int], casted: bool): + def get_graph_inputs_from_node_type(self, op_type: str, input_indices: list[int], casted: bool): """ Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). Returns a list of the graph input names based on the filter whether it is casted or not. @@ -323,7 +322,7 @@ def postprocess(self): self.clean_graph() self.prune_graph() - def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + def optimize(self, options: FusionOptions | None = None, add_dynamic_axes: bool = False): if (options is not None) and not options.enable_shape_inference: self.disable_shape_inference() diff --git a/onnxruntime/python/tools/transformers/onnx_model_conformer.py b/onnxruntime/python/tools/transformers/onnx_model_conformer.py index 1506d85f53fd4..65723aabc2e18 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_conformer.py +++ b/onnxruntime/python/tools/transformers/onnx_model_conformer.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import logging -from typing import Optional from fusion_attention import AttentionMask from fusion_conformer_attention import FusionConformerAttention @@ -19,7 +18,7 @@ def __init__(self, model, num_heads, hidden_size): self.attention_mask = AttentionMask(self) self.attention_fusion = FusionConformerAttention(self, self.hidden_size, self.num_heads, self.attention_mask) - def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + def optimize(self, options: FusionOptions | None = None, add_dynamic_axes: bool = False): self.attention_fusion.use_multi_head_attention = False if options is None else options.use_multi_head_attention self.attention_fusion.disable_multi_head_attention_bias = ( False if options is None else options.disable_multi_head_attention_bias diff --git a/onnxruntime/python/tools/transformers/onnx_model_mmdit.py b/onnxruntime/python/tools/transformers/onnx_model_mmdit.py index 4c9b19c0c97ca..35a574129e78c 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_mmdit.py +++ b/onnxruntime/python/tools/transformers/onnx_model_mmdit.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- import logging -from typing import Optional from fusion_layernorm import FusionLayerNormalization from fusion_mha_mmdit import FusionMultiHeadAttentionMMDit @@ -47,7 +46,7 @@ def fuse_multi_head_attention(self): fusion = FusionMultiHeadAttentionMMDit(self) fusion.apply() - def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + def optimize(self, options: FusionOptions | None = None, add_dynamic_axes: bool = False): assert not add_dynamic_axes if is_installed("tqdm"): @@ -62,7 +61,7 @@ def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bo logger.info("tqdm is not installed. 
Run optimization without progress bar") self._optimize(options, None) - def _optimize(self, options: Optional[FusionOptions] = None, progress_bar=None): + def _optimize(self, options: FusionOptions | None = None, progress_bar=None): if (options is not None) and not options.enable_shape_inference: self.disable_shape_inference() diff --git a/onnxruntime/python/tools/transformers/onnx_model_phi.py b/onnxruntime/python/tools/transformers/onnx_model_phi.py index 5df765033578b..d2f10d0bc18af 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_phi.py +++ b/onnxruntime/python/tools/transformers/onnx_model_phi.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import List, Optional import numpy as np from dynamo_onnx_helper import DynamoOnnxHelper @@ -70,7 +69,7 @@ class Fission(Fusion): def __init__( self, model: OnnxModel, - nodes_to_find: List[str], + nodes_to_find: list[str], ): super().__init__(model, "DONOTUSE", nodes_to_find) @@ -129,7 +128,7 @@ def replace_fp32_value_info(self, name, shape): self.model.graph().value_info.extend([new_value_info]) def set_unique_name_and_add_nodes( - self, subgraph_nodes: List[NodeProto], layer_id: int, layer_known_edges_names: List[str] + self, subgraph_nodes: list[NodeProto], layer_id: int, layer_known_edges_names: list[str] ): for new_node in subgraph_nodes: for i, name in enumerate(new_node.input): @@ -148,7 +147,7 @@ def set_unique_name_and_add_nodes( self.nodes_to_add.append(new_node) self.node_name_to_graph_name[new_node.name] = self.this_graph_name - def layernorm(self, inputs: List[str], outputs: List[str], prefix: str = ""): + def layernorm(self, inputs: list[str], outputs: list[str], prefix: str = ""): assert len(inputs) == 3 assert len(outputs) == 1 node = helper.make_node( @@ -160,7 +159,7 @@ def layernorm(self, inputs: List[str], outputs: List[str], prefix: str = ""): ) return [node] - def gemm(self, inputs: List[str], outputs: List[str], prefix: str = ""): + def gemm(self, inputs: list[str], outputs: list[str], prefix: str = ""): assert len(inputs) == 3 assert len(outputs) == 1 matmul = helper.make_node( @@ -177,7 +176,7 @@ def gemm(self, inputs: List[str], outputs: List[str], prefix: str = ""): ) return [matmul, add] - def rotary(self, inputs: List[str], outputs: List[str], prefix: str = "", rot_dim=32, num_heads=32): + def rotary(self, inputs: list[str], outputs: list[str], prefix: str = "", rot_dim=32, num_heads=32): assert len(inputs) == 4 assert len(outputs) == 1 node = helper.make_node( @@ -191,7 +190,7 @@ def rotary(self, inputs: List[str], outputs: List[str], prefix: str = "", rot_di ) return [node] - def fastgelu(self, inputs: List[str], outputs: List[str], prefix: str = ""): + def fastgelu(self, inputs: list[str], outputs: list[str], prefix: str = ""): assert len(inputs) == 1 assert len(outputs) == 1 node = helper.make_node( @@ -203,7 +202,7 @@ def fastgelu(self, inputs: List[str], outputs: List[str], prefix: str = ""): ) return [node] - def add(self, inputs: List[str], outputs: List[str], prefix: str = ""): + def add(self, inputs: list[str], outputs: list[str], prefix: str = ""): assert len(inputs) == 2 assert len(outputs) == 1 node = helper.make_node( @@ -214,7 +213,7 @@ def add(self, inputs: List[str], outputs: List[str], prefix: str = ""): ) return [node] - def mha(self, inputs: List[str], outputs: List[str], prefix: str = "", num_heads=32): + def mha(self, inputs: list[str], outputs: list[str], prefix: str = "", num_heads=32): assert 
len(inputs) == 8 assert len(outputs) == 3 node = helper.make_node( @@ -228,7 +227,7 @@ def mha(self, inputs: List[str], outputs: List[str], prefix: str = "", num_heads ) return [node] - def gqa(self, inputs: List[str], outputs: List[str], prefix: str = "", num_heads=32): + def gqa(self, inputs: list[str], outputs: list[str], prefix: str = "", num_heads=32): assert len(inputs) == 7 assert len(outputs) == 3 node = helper.make_node( @@ -242,7 +241,7 @@ def gqa(self, inputs: List[str], outputs: List[str], prefix: str = "", num_heads ) return [node] - def attention(self, inputs: List[str], outputs: List[str], prefix: str = "", num_heads=32): + def attention(self, inputs: list[str], outputs: list[str], prefix: str = "", num_heads=32): assert len(inputs) == 5 assert len(outputs) == 2 node = helper.make_node( @@ -260,8 +259,8 @@ def attention(self, inputs: List[str], outputs: List[str], prefix: str = "", num def paged_attn( self, - inputs: List[str], - outputs: List[str], + inputs: list[str], + outputs: list[str], prefix: str = "", num_heads=32, head_size=80, @@ -853,7 +852,7 @@ def __init__(self, model: ModelProto, num_heads: int, hidden_size: int): self.fission_transformer_layernorm = FissionTransformerLayerNormPhi(self) self.fission_transformer_embedding = FissionTransformerEmbeddingPhi(self) - def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + def optimize(self, options: FusionOptions | None = None, add_dynamic_axes: bool = False): assert options is not None attn_op_type = options.attention_op_type diff --git a/onnxruntime/python/tools/transformers/onnx_model_sam2.py b/onnxruntime/python/tools/transformers/onnx_model_sam2.py index ac608fb509a81..9d57081c4ce12 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_sam2.py +++ b/onnxruntime/python/tools/transformers/onnx_model_sam2.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- import logging -from typing import Optional from fusion_attention_sam2 import FusionMultiHeadAttentionSam2 from fusion_layernorm import FusionLayerNormalizationNCHW @@ -39,11 +38,11 @@ def fuse_layer_norm(self): fusion = FusionLayerNormalizationNCHW(self) fusion.apply() - def fuse_multi_head_attention(self, options: Optional[FusionOptions] = None): + def fuse_multi_head_attention(self, options: FusionOptions | None = None): mha_fusion = FusionMultiHeadAttentionSam2(self, self.hidden_size, self.num_heads) mha_fusion.apply() - def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False): + def optimize(self, options: FusionOptions | None = None, add_dynamic_axes: bool = False): if is_installed("tqdm"): import tqdm from tqdm.contrib.logging import logging_redirect_tqdm @@ -56,7 +55,7 @@ def optimize(self, options: Optional[FusionOptions] = None, add_dynamic_axes: bo logger.info("tqdm is not installed. 
Run optimization without progress bar") self._optimize(options, None) - def _optimize(self, options: Optional[FusionOptions] = None, progress_bar=None): + def _optimize(self, options: FusionOptions | None = None, progress_bar=None): if (options is not None) and not options.enable_shape_inference: self.disable_shape_inference() diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index 70742bb5f52e3..33dcc7795a465 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- import logging -from typing import Optional, Union import numpy as np from fusion_attention import AttentionMask, FusionAttention @@ -50,8 +49,8 @@ def create_attention_node( input: str, output: str, add_qk_str: str, - scale: Optional[float] = None, - ) -> Union[NodeProto, None]: + scale: float | None = None, + ) -> NodeProto | None: """Create an Attention node. Args: mask_index (str): mask input @@ -163,7 +162,7 @@ def create_mha_node( present_value: str, num_heads: int, hidden_size: int, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: assert num_heads > 0 if hidden_size > 0 and (hidden_size % num_heads) != 0: diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index f5a47b19d67fc..125aa47a7dbed 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- import logging -from typing import Union from fusion_attention import AttentionMask, FusionAttention from fusion_utils import NumpyHelper @@ -39,7 +38,7 @@ def create_attention_node( input: str, output: str, add_qk_str: str, - ) -> Union[NodeProto, None]: + ) -> NodeProto | None: assert num_heads > 0 if hidden_size > 0 and (hidden_size % num_heads) != 0: logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py index 77e24986f0fde..e96cf32927171 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_unet.py +++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- import logging -from typing import Optional from fusion_attention_unet import FusionAttentionUnet from fusion_bias_add import FusionBiasAdd @@ -91,7 +90,7 @@ def merge_adjacent_transpose(self): if total: logger.info("Removed %d Transpose nodes", total) - def fuse_multi_head_attention(self, options: Optional[FusionOptions] = None): + def fuse_multi_head_attention(self, options: FusionOptions | None = None): # Self Attention enable_packed_qkv = (options is None) or options.enable_packed_qkv self_attention_fusion = FusionAttentionUnet( @@ -120,7 +119,7 @@ def fuse_bias_add(self): fusion = FusionBiasAdd(self) fusion.apply() - def optimize(self, options: Optional[FusionOptions] = None): + def optimize(self, options: FusionOptions | None = None): if is_installed("tqdm"): import tqdm from tqdm.contrib.logging import logging_redirect_tqdm @@ -133,7 +132,7 @@ def optimize(self, options: Optional[FusionOptions] = 
None): logger.info("tqdm is not installed. Run optimization without progress bar") self._optimize(options, None) - def _optimize(self, options: Optional[FusionOptions] = None, progress_bar=None): + def _optimize(self, options: FusionOptions | None = None, progress_bar=None): if (options is not None) and not options.enable_shape_inference: self.disable_shape_inference() diff --git a/onnxruntime/python/tools/transformers/onnx_model_vae.py b/onnxruntime/python/tools/transformers/onnx_model_vae.py index de8b59074a871..1e531bbc3eff3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_vae.py +++ b/onnxruntime/python/tools/transformers/onnx_model_vae.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Optional from fusion_attention_vae import FusionAttentionVae from fusion_options import FusionOptions @@ -19,7 +18,7 @@ def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): assert (num_heads == 0 and hidden_size == 0) or (num_heads > 0 and hidden_size % num_heads == 0) super().__init__(model, num_heads=num_heads, hidden_size=hidden_size) - def fuse_multi_head_attention(self, options: Optional[FusionOptions] = None): + def fuse_multi_head_attention(self, options: FusionOptions | None = None): # Self Attention self_attention_fusion = FusionAttentionVae(self, self.hidden_size, self.num_heads) self_attention_fusion.apply() diff --git a/onnxruntime/python/tools/transformers/onnx_utils.py b/onnxruntime/python/tools/transformers/onnx_utils.py index 64fade9369395..7f681d783cb64 100644 --- a/onnxruntime/python/tools/transformers/onnx_utils.py +++ b/onnxruntime/python/tools/transformers/onnx_utils.py @@ -35,7 +35,7 @@ def extract_raw_data_from_model(model: ModelProto): initializer.name = name initializer.ClearField("raw_data") - return zip(*external_data) + return zip(*external_data, strict=False) def has_external_data(model: ModelProto): diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index a83c54e345d7d..c4d187e8bf031 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -22,7 +22,6 @@ import os import tempfile from pathlib import Path -from typing import Dict, List, Optional, Union import coloredlogs from fusion_options import FusionOptions @@ -72,17 +71,17 @@ def optimize_by_onnxruntime( - onnx_model: Optional[Union[str, ModelProto]] = None, + onnx_model: str | ModelProto | None = None, use_gpu: bool = False, - optimized_model_path: Optional[str] = None, - opt_level: Optional[int] = 99, - disabled_optimizers: List[str] = [], # noqa: B006 + optimized_model_path: str | None = None, + opt_level: int | None = 99, + disabled_optimizers: list[str] = [], # noqa: B006 verbose: bool = False, save_as_external_data: bool = False, external_data_filename: str = "", external_data_file_threshold: int = 1024, *, - provider: Optional[str] = None, + provider: str | None = None, **deprecated_kwargs, ) -> str: """ @@ -217,7 +216,7 @@ def optimize_by_fusion( model_type: str = "bert", num_heads: int = 0, hidden_size: int = 0, - optimization_options: Optional[FusionOptions] = None, + optimization_options: FusionOptions | None = None, ) -> OnnxModel: """Optimize Model by graph fusion logic. 
@@ -274,17 +273,17 @@ def optimize_by_fusion( def optimize_model( - input: Union[str, ModelProto], + input: str | ModelProto, model_type: str = "bert", num_heads: int = 0, hidden_size: int = 0, - optimization_options: Optional[FusionOptions] = None, - opt_level: Optional[int] = None, + optimization_options: FusionOptions | None = None, + opt_level: int | None = None, use_gpu: bool = False, only_onnxruntime: bool = False, verbose: bool = False, *, - provider: Optional[str] = None, + provider: str | None = None, ) -> OnnxModel: """Optimize Model by OnnxRuntime and/or python fusion logic. @@ -414,7 +413,7 @@ def optimize_model( return optimizer -def get_fusion_statistics(optimized_model_path: str) -> Dict[str, int]: +def get_fusion_statistics(optimized_model_path: str) -> dict[str, int]: """ Get counter of fused operators in optimized model. diff --git a/onnxruntime/python/tools/transformers/shape_infer_helper.py b/onnxruntime/python/tools/transformers/shape_infer_helper.py index f1fc0c952e8e4..f4d65d05ad0c8 100644 --- a/onnxruntime/python/tools/transformers/shape_infer_helper.py +++ b/onnxruntime/python/tools/transformers/shape_infer_helper.py @@ -6,7 +6,6 @@ import logging import os import sys -from typing import Dict # In ORT Package the symbolic_shape_infer.py is in ../tools file_path = os.path.dirname(__file__) @@ -26,9 +25,9 @@ def __init__(self, model, verbose=0, int_max=2**31 - 1, auto_merge=True, guess_o self.model_ = model self.all_shapes_inferred_: bool = False self.is_inferred_: bool = False - self.dynamic_axis_mapping_: Dict[str, int] = {} + self.dynamic_axis_mapping_: dict[str, int] = {} - def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 200): + def infer(self, dynamic_axis_mapping: dict[str, int], max_runs: int = 200): """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided. 
Args: diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index 17fd54f19baf2..9f590dfb86911 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -16,7 +16,6 @@ from collections import deque # noqa: F401 from datetime import datetime from pathlib import Path # noqa: F401 -from typing import List, Optional import numpy as np import onnx @@ -271,7 +270,7 @@ def validate_input(self, input: str): valid_names = [input.name for input in self.model.graph.input] raise Exception(f"Input {input} does not exist in the graph inputs: {valid_names}") - def validate_outputs(self, output_names: List[str]): + def validate_outputs(self, output_names: list[str]): valid_names = [output.name for output in self.model.graph.output] for name in output_names: if name not in valid_names: @@ -285,7 +284,7 @@ def optimize( input_mask: str, enable_shape_opt: bool, enable_reshape_opt: bool, - output_names: Optional[List[str]] = None, + output_names: list[str] | None = None, batch_size=1, sequence_length=128, verbose=False, diff --git a/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py b/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py index bdb0ffc6c50db..52ce2ef5fdef1 100644 --- a/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py +++ b/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py @@ -7,7 +7,6 @@ # CUBLAS_WORKSPACE_CONFIG=:4096:8 python multihead_attention_op_test_data_gen.py import math -from typing import Optional, Tuple import numpy as np import torch @@ -56,12 +55,12 @@ def get_extended_attention_mask(self, attention_mask: Tensor, dtype: torch.dtype def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor]: + attention_mask: torch.FloatTensor | None = None, + encoder_hidden_states: torch.FloatTensor | None = None, + encoder_attention_mask: torch.FloatTensor | None = None, + past_key_value: tuple[tuple[torch.FloatTensor]] | None = None, + output_attentions: bool | None = False, + ) -> tuple[torch.Tensor]: mixed_query_layer = self.query(hidden_states) if self.verbose: print("q", mixed_query_layer) diff --git a/onnxruntime/test/providers/cpu/rnn/LSTM.py b/onnxruntime/test/providers/cpu/rnn/LSTM.py index 49e28a93385a4..472fa5f844ac0 100644 --- a/onnxruntime/test/providers/cpu/rnn/LSTM.py +++ b/onnxruntime/test/providers/cpu/rnn/LSTM.py @@ -2,13 +2,7 @@ # Licensed under the MIT License. -from typing import Any, Tuple # noqa: F401 - -import numpy as np # type: ignore - -# import onnx -# from ..base import Base -# from . 
import expect +import numpy as np DebugOutput = True np.set_printoptions(suppress=True) # , precision=16, floatmode='maxprec') diff --git a/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py b/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py index 796a58f1a929c..5276b70789db1 100644 --- a/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py +++ b/onnxruntime/test/providers/cpu/tensor/affine_grid_test_gen.py @@ -24,7 +24,7 @@ test_count = 0 for align_corners in align_corners_options: - for angle, translation, scale in zip(angles, translations, scales): + for angle, translation, scale in zip(angles, translations, scales, strict=False): for size in sizes: theta = np.array([], dtype=np.float32) for _ in range(size[0]): @@ -71,7 +71,7 @@ test_count = 0 for align_corners in align_corners_options: - for angle, translation, scale in zip(angles, translations, scales): + for angle, translation, scale in zip(angles, translations, scales, strict=False): for size in sizes: theta = np.array([], dtype=np.float32) for _ in range(size[0]): diff --git a/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py b/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py index dd5d5cc90e0bf..1459dfc61c84c 100644 --- a/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py +++ b/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py @@ -65,11 +65,11 @@ def expect( del kwargs["output_types"] inputs_vi = [ _extract_value_info(arr, arr_name, input_type) - for arr, arr_name, input_type in zip(inputs, present_inputs, input_types) + for arr, arr_name, input_type in zip(inputs, present_inputs, input_types, strict=False) ] outputs_vi = [ _extract_value_info(arr, arr_name, output_type) - for arr, arr_name, output_type in zip(outputs, present_outputs, output_types) + for arr, arr_name, output_type in zip(outputs, present_outputs, output_types, strict=False) ] graph = onnx.helper.make_graph(nodes=[node], name=name, inputs=inputs_vi, outputs=outputs_vi) diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 8fc76da3495a8..23f6d3e23e9bf 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -9,7 +9,6 @@ import re import sys import unittest -from typing import Dict import numpy as np import onnx @@ -28,8 +27,8 @@ class OrtBackendTest(onnx.backend.test.runner.Runner): # pylint: disable=too-few-public-methods def __init__( self, - rtol_overrides: Dict[str, float], - atol_overrides: Dict[str, float], + rtol_overrides: dict[str, float], + atol_overrides: dict[str, float], ): self._rtol_overrides = rtol_overrides self._atol_overrides = atol_overrides diff --git a/onnxruntime/test/python/onnxruntime_test_distributed.py b/onnxruntime/test/python/onnxruntime_test_distributed.py index de70478761f19..7f4f4b5bb2270 100644 --- a/onnxruntime/test/python/onnxruntime_test_distributed.py +++ b/onnxruntime/test/python/onnxruntime_test_distributed.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. 
import unittest -from typing import Tuple import numpy as np import onnxscript @@ -23,7 +22,7 @@ def shard_tensor_per_device_mesh(X, rank, axis, device_mesh): if axis is None: return X shards = np.split(X, len(device_mesh), axis) - selected_shards = tuple(shard for device_id, shard in zip(device_mesh, shards) if device_id == rank) + selected_shards = tuple(shard for device_id, shard in zip(device_mesh, shards, strict=False) if device_id == rank) return np.concatenate(selected_shards, axis=axis) @@ -99,12 +98,12 @@ def shard_tensor_per_spec(tensor: np.ndarray, rank: int, spec: str, device_mesh: class TestDistributedReshape(unittest.TestCase): def _check_distributed_reshape( self, - shape: Tuple[int, ...], - target_shape: Tuple[int, ...], + shape: tuple[int, ...], + target_shape: tuple[int, ...], input_device_meshes: np.ndarray, - input_shard_specs: Tuple[str, ...], + input_shard_specs: tuple[str, ...], output_device_meshes: np.ndarray, - output_shard_specs: Tuple[str, ...], + output_shard_specs: tuple[str, ...], ): input_device_mesh_shapes, input_device_mesh_elements = translate_all_device_meshes(input_device_meshes) output_device_mesh_shapes, output_device_mesh_elements = translate_all_device_meshes(output_device_meshes) @@ -683,12 +682,12 @@ def test_reshape_two_axis_fusion_shape_3_7_4096_rrs_01_shape_21_4906_rs_01(self) class TestDistributedExpand(unittest.TestCase): def _check_distributed_expand( self, - shape: Tuple[int, ...], - target_shape: Tuple[int, ...], + shape: tuple[int, ...], + target_shape: tuple[int, ...], input_device_meshes: np.ndarray, - input_shard_specs: Tuple[str, ...], + input_shard_specs: tuple[str, ...], output_device_meshes: np.ndarray, - output_shard_specs: Tuple[str, ...], + output_shard_specs: tuple[str, ...], ): assert len(input_device_meshes) == len(input_shard_specs) assert len(output_device_meshes) == len(output_shard_specs) @@ -855,12 +854,12 @@ def test_expand_in_tiny_llama(self): class TestDistributedUnsqueeze(unittest.TestCase): def _check_distributed_unsqueeze( self, - shape: Tuple[int, ...], - axes: Tuple[int, ...], + shape: tuple[int, ...], + axes: tuple[int, ...], input_device_meshes: np.ndarray, - input_shard_specs: Tuple[str, ...], + input_shard_specs: tuple[str, ...], output_device_meshes: np.ndarray, - output_shard_specs: Tuple[str, ...], + output_shard_specs: tuple[str, ...], ): assert len(input_device_meshes) == len(input_shard_specs) assert len(output_device_meshes) == len(output_shard_specs) @@ -977,12 +976,12 @@ def test_unsqueeze_not_sharded(self): class TestDistributedSqueeze(unittest.TestCase): def _check_distributed_squeeze( self, - shape: Tuple[int, ...], - axes: Tuple[int, ...], + shape: tuple[int, ...], + axes: tuple[int, ...], input_device_meshes: np.ndarray, - input_shard_specs: Tuple[str, ...], + input_shard_specs: tuple[str, ...], output_device_meshes: np.ndarray, - output_shard_specs: Tuple[str, ...], + output_shard_specs: tuple[str, ...], ): assert len(input_device_meshes) == len(input_shard_specs) assert len(output_device_meshes) == len(output_shard_specs) @@ -1086,12 +1085,12 @@ def _check_distributed_reduce( self, keepdims: int, dtype: np.dtype, - shape: Tuple[int, ...], - axes: Tuple[int, ...], + shape: tuple[int, ...], + axes: tuple[int, ...], input_device_meshes: np.ndarray, - input_shard_specs: Tuple[str, ...], + input_shard_specs: tuple[str, ...], output_device_meshes: np.ndarray, - output_shard_specs: Tuple[str, ...], + output_shard_specs: tuple[str, ...], ): assert len(input_device_meshes) == len(input_shard_specs) 
assert len(output_device_meshes) == len(output_shard_specs) @@ -1146,6 +1145,7 @@ def distributed_reduce_mean_instance(data_tensor: FLOAT, axes_tensor: INT64): for onnx_func, np_func in zip( [distributed_reduce_sum_instance, distributed_reduce_max_instance, distributed_reduce_mean_instance], [np.sum, np.maximum.reduce, np.mean], + strict=False, ): data = np.random.randint(4, size=shape).astype(dtype) expected = np_func(data, axis=axes, keepdims=bool(keepdims)) diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index ce04dff2aecb0..5ab2fe8939f6a 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import unittest -from typing import Dict, List import numpy as np from helper import get_name @@ -14,7 +13,7 @@ class CudaGraphHelper: def __init__( self, ort_session: onnxrt.InferenceSession, - input_and_output_shape: Dict[str, List[int]], + input_and_output_shape: dict[str, list[int]], device_id: int = 0, ): self.input_names = [input.name for input in ort_session.get_inputs()] @@ -52,7 +51,7 @@ def get_io_numpy_type_map(self, ort_session: onnxrt.InferenceSession): return name_to_numpy_type - def update_inputs(self, inputs: Dict[str, np.ndarray]): + def update_inputs(self, inputs: dict[str, np.ndarray]): for input_name in self.input_names: self.io_ort_value[input_name].update_inplace(inputs[input_name]) diff --git a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py index 29292c2a777b1..033eae1cb4c8d 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_dmlgraph.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import unittest -from typing import Dict, List import numpy as np from helper import get_name @@ -14,7 +13,7 @@ class DmlGraphHelper: def __init__( self, ort_session: onnxrt.InferenceSession, - input_and_output_shape: Dict[str, List[int]], + input_and_output_shape: dict[str, list[int]], device_id: int = 0, ): self.input_names = [input.name for input in ort_session.get_inputs()] @@ -52,7 +51,7 @@ def get_io_numpy_type_map(self, ort_session: onnxrt.InferenceSession): return name_to_numpy_type - def update_inputs(self, inputs: Dict[str, np.ndarray]): + def update_inputs(self, inputs: dict[str, np.ndarray]): for input_name in self.input_names: self.io_ort_value[input_name].update_inplace(inputs[input_name]) diff --git a/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py index bf354ad9f9e10..0a311245dd2b5 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py +++ b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py @@ -2,8 +2,8 @@ # Licensed under the MIT License. 
import unittest +from collections.abc import Sequence from copy import deepcopy -from typing import Optional, Sequence, Tuple import numpy as np from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, checker, helper @@ -31,7 +31,7 @@ def make_optional_tensor_value_info(name: str, elem_type: int, shape: Sequence[i return vi -def make_optional_vi(vi: ValueInfoProto, name: Optional[str] = None) -> ValueInfoProto: +def make_optional_vi(vi: ValueInfoProto, name: str | None = None) -> ValueInfoProto: """Makes a copy of `vi` with optional type.""" name = name or vi.name + ".opt" vi_type = vi.type.tensor_type @@ -40,7 +40,7 @@ def make_optional_vi(vi: ValueInfoProto, name: Optional[str] = None) -> ValueInf return opt_vi -def make_const(vi: ValueInfoProto, name: str, value: int = 0) -> Tuple[ValueInfoProto, NodeProto, TensorProto]: +def make_const(vi: ValueInfoProto, name: str, value: int = 0) -> tuple[ValueInfoProto, NodeProto, TensorProto]: """Creates a constant 1D tensor from `vi`.""" const_vi = make_vi_like(vi, name) const_shape = [d.dim_value for d in vi.type.tensor_type.shape.dim] diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index 2f8fb84c4c651..92d6d758eef4d 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -120,7 +120,7 @@ def _check_shapes(self, graph, inferred_graph, vis): # type: (GraphProto, Graph vis_names = {x.name for x in vis} inferred_vis_names = {x.name for x in inferred_vis} assert vis_names == inferred_vis_names, (vis_names, inferred_vis_names) - for vi, inferred_vi in zip(vis, inferred_vis): + for vi, inferred_vi in zip(vis, inferred_vis, strict=False): assert vi == inferred_vi, f"\n{vi}\n{inferred_vi}\n" raise AssertionError() diff --git a/onnxruntime/test/python/onnxruntime_test_scatternd.py b/onnxruntime/test/python/onnxruntime_test_scatternd.py index e75c04dfb9965..42f706d1eec0f 100644 --- a/onnxruntime/test/python/onnxruntime_test_scatternd.py +++ b/onnxruntime/test/python/onnxruntime_test_scatternd.py @@ -19,7 +19,7 @@ def has_cuda(): return "CUDAExecutionProvider" in available_providers -def ignore_warnings(warns: typing.List[Warning]) -> typing.Callable: +def ignore_warnings(warns: list[Warning]) -> typing.Callable: def wrapper(fct): if warns is None: raise AssertionError(f"warns cannot be None for '{fct}'.") diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py index 5856ec44bc85f..60c5f9d404258 100644 --- a/onnxruntime/test/python/quantization/test_calibration.py +++ b/onnxruntime/test/python/quantization/test_calibration.py @@ -358,9 +358,9 @@ def test_compute_data(self): rmin = np.minimum(rmin, np.amin(output, axis=1)) rmax = np.maximum(rmax, np.amax(output, axis=1)) - min_max_pairs = list(zip(rmin, rmax)) + min_max_pairs = list(zip(rmin, rmax, strict=False)) output_names = [infer_session.get_outputs()[i].name for i in range(len(infer_session.get_outputs()))] - output_min_max_dict = dict(zip(output_names, min_max_pairs)) + output_min_max_dict = dict(zip(output_names, min_max_pairs, strict=False)) for output_name, min_max in output_min_max_dict.items(): self.assertEqual(min_max, tensors_range[output_name].range_value) @@ -521,9 +521,9 @@ def test_compute_data_per_channel(self): rmin = np.minimum(rmin, np.amin(output, axis=-1)) rmax = np.maximum(rmax, 
np.amax(output, axis=-1)) - min_max_pairs = list(zip(rmin, rmax)) + min_max_pairs = list(zip(rmin, rmax, strict=False)) output_names = [infer_session.get_outputs()[i].name for i in range(len(infer_session.get_outputs()))] - output_min_max_dict = dict(zip(output_names, min_max_pairs)) + output_min_max_dict = dict(zip(output_names, min_max_pairs, strict=False)) for output_name, min_max in output_min_max_dict.items(): np.testing.assert_equal(min_max, tensors_range[output_name].range_value) diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 292dc50124c16..ed0c65cba78ac 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -9,7 +9,6 @@ import unittest from importlib.util import find_spec from pathlib import Path -from typing import Dict, Tuple, Union import numpy as np import onnx @@ -28,7 +27,7 @@ def setUpClass(cls): def tearDownClass(cls): cls._tmp_model_dir.cleanup() - def fill_int4_data(self, shape: Union[int, Tuple[int, ...]], symmetric: bool) -> np.ndarray: + def fill_int4_data(self, shape: int | tuple[int, ...], symmetric: bool) -> np.ndarray: line = np.zeros(shape) line = line.reshape(-1) @@ -54,7 +53,7 @@ def fill_int4_data(self, shape: Union[int, Tuple[int, ...]], symmetric: bool) -> def input_feeds( self, n: int, - name2shape: Dict[str, Union[int, Tuple[int, ...]]], + name2shape: dict[str, int | tuple[int, ...]], low: int = -1, high: int = 2, dtype: type = np.float32, @@ -79,7 +78,7 @@ def construct_model_matmul(self, output_model_path: str, symmetric: bool) -> Non initializers = [] def make_matmul( - input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str, node_name: str + input_name, weight_shape: int | tuple[int, ...], weight_name: str, output_name: str, node_name: str ): weight_data = self.fill_int4_data(weight_shape, symmetric).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) @@ -137,7 +136,7 @@ def construct_model_gather( initializers = [] def make_gather( - indices_name, data_shape: Union[int, Tuple[int, ...]], data_name: str, output_name: str, node_name: str + indices_name, data_shape: int | tuple[int, ...], data_name: str, output_name: str, node_name: str ): weight_data = self.fill_int4_data(data_shape, symmetric).astype( np.float32 if tdata == TensorProto.FLOAT else np.float16 @@ -184,8 +183,8 @@ def quant_test( block_size: int, is_symmetric: bool, quant_format: quant_utils.QuantFormat = quant_utils.QuantFormat.QOperator, - op_types_to_quantize: Tuple[str, ...] = ("MatMul",), - quant_axes: Tuple[Tuple[str, int], ...] = (("MatMul", 0), ("Gather", 1)), + op_types_to_quantize: tuple[str, ...] = ("MatMul",), + quant_axes: tuple[tuple[str, int], ...] 
= (("MatMul", 0), ("Gather", 1)), rtol: float = 0.01, atol: float = 0.05, ): diff --git a/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py b/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py index 88432d75c653e..d32abc1476600 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_bnb4.py @@ -9,7 +9,6 @@ import unittest from importlib.util import find_spec from pathlib import Path -from typing import Dict, Tuple, Union import numpy as np import onnx @@ -67,7 +66,7 @@ def setUpClass(cls): def tearDownClass(cls): cls._tmp_model_dir.cleanup() - def fill_bnb4_data(self, shape: Tuple[int, int], quant_type: int) -> np.ndarray: + def fill_bnb4_data(self, shape: tuple[int, int], quant_type: int) -> np.ndarray: rows, cols = shape line = np.zeros(shape) line = line.reshape(-1) @@ -84,7 +83,7 @@ def fill_bnb4_data(self, shape: Tuple[int, int], quant_type: int) -> np.ndarray: line = line.reshape(cols, rows).transpose() return line.reshape(shape) - def input_feeds(self, n: int, name2shape: Dict[str, Union[int, Tuple[int, ...]]]) -> TestDataFeeds: + def input_feeds(self, n: int, name2shape: dict[str, int | tuple[int, ...]]) -> TestDataFeeds: input_data_list = [] for _i in range(n): inputs = {} @@ -104,7 +103,7 @@ def construct_model_matmul(self, output_model_path: str, quant_type: int) -> Non output_name = "output" initializers = [] - def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str): + def make_matmul(input_name, weight_shape: int | tuple[int, ...], weight_name: str, output_name: str): weight_data = self.fill_bnb4_data(weight_shape, quant_type).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name)) return onnx.helper.make_node( diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py index 755c7fae5e3e8..28dc8f4b7dee7 100644 --- a/onnxruntime/test/python/quantization/test_op_pad.py +++ b/onnxruntime/test/python/quantization/test_op_pad.py @@ -54,7 +54,7 @@ def construct_model_pad( input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, pad_input_shape) pad_dims_initializer = helper.make_tensor("pad_dims", TensorProto.INT64, [2 * rank], pad_dims) - output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:]))] + output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:], strict=False))] output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape) inputs = ["input", "pad_dims"] @@ -108,7 +108,7 @@ def construct_model_conv_pad( identity_node = helper.make_node("Identity", ["conv_output"], ["identity_out"], name="IdentityNode") pad_dims_initializer = helper.make_tensor("pad_dims", TensorProto.INT64, [2 * rank], pad_dims) - output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:]))] + output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:], strict=False))] output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape) pad_inputs = ["conv_output", "pad_dims"] initializers = [conv_weight_initializer, pad_dims_initializer] @@ -385,7 +385,7 @@ def construct_edge_case_model( identity_node = helper.make_node("Identity", ["conv_output"], ["identity_out"], name="IdentityNode") pad_dims_initializer = helper.make_tensor("pad_dims", TensorProto.INT64, [2 * rank], 
pad_dims) - output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:]))] + output_shape = [sum(e) for e in list(zip(pad_input_shape, pad_dims[:rank], pad_dims[rank:], strict=False))] output_tensor = helper.make_tensor_value_info("output", TensorProto.FLOAT, output_shape) pad_inputs = ["conv_output", "pad_dims"] initializers = [conv_weight_initializer, pad_dims_initializer] diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index e9108f157f953..5d70641547eae 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -9,7 +9,6 @@ import tempfile import unittest from pathlib import Path -from typing import Dict, List import numpy as np import onnx @@ -108,7 +107,7 @@ def rewind(self): def augment_model_collect_activations( model_path: str, augmented_model_path: str, data_reader: TestDataReader -) -> Dict[str, List[np.ndarray]]: +) -> dict[str, list[np.ndarray]]: modify_model_output_intermediate_tensors(model_path, augmented_model_path) tensor_dict = collect_activations(augmented_model_path, data_reader) @@ -149,12 +148,12 @@ def test_saved_tensors_match_internal_tensors(self): output_dict = {} output_info = infer_session.get_outputs() for batch in oracle_outputs: - for output, output_data in zip(output_info, batch): + for output, output_data in zip(output_info, batch, strict=False): output_dict.setdefault(output.name, []).append(output_data) for output_name, model_outputs in output_dict.items(): test_outputs = tensor_dict[output_name] - for expected, actual in zip(model_outputs, test_outputs): + for expected, actual in zip(model_outputs, test_outputs, strict=False): exp = expected.reshape(-1) act = actual.reshape(-1) np.testing.assert_equal(exp, act) diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 5617a424cf4dc..be10575b535e4 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -420,7 +420,7 @@ def test_qdq_overrides_per_channel2(self): ) self.assertEqual(wgt_zp.data_type, quant_type.tensor_type) - for index, (zp, scale) in enumerate(zip(wgt_zp.int32_data, wgt_sc.float_data)): + for index, (zp, scale) in enumerate(zip(wgt_zp.int32_data, wgt_sc.float_data, strict=False)): wgt_qmin, wgt_qmax = get_qmin_qmax_for_qType( wgt_zp.data_type, symmetric=True, # per-channel is always symmetric diff --git a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py index 5e20d6b4e692a..96a9aaad3c331 100644 --- a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py +++ b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py @@ -43,7 +43,10 @@ def to_numpy(tensor): assert len(outputs) == len(ort_outs), "number of outputs differ" # compare onnxruntime and PyTorch results - [np.testing.assert_allclose(out, ort_out, rtol=rtol, atol=atol) for out, ort_out in zip(outputs, ort_outs)] + [ + np.testing.assert_allclose(out, ort_out, rtol=rtol, atol=atol) + for out, ort_out in zip(outputs, ort_outs, strict=False) + ] # These set of tests verify ONNX model export and compares outputs between diff --git a/onnxruntime/test/python/transformers/benchmark_gqa.py b/onnxruntime/test/python/transformers/benchmark_gqa.py 
index 5cef4ae863a0e..41dbdf255f35c 100644 --- a/onnxruntime/test/python/transformers/benchmark_gqa.py +++ b/onnxruntime/test/python/transformers/benchmark_gqa.py @@ -7,13 +7,11 @@ Benchmark performance of GroupQueryAttention. """ -from typing import Optional - import torch from test_sparse_attention import GroupQueryAttentionConfig, OrtGroupQueryAttention -def get_plot_algos(sm: int, local_window_size: Optional[int]): +def get_plot_algos(sm: int, local_window_size: int | None): # GQA with local windows only works in sm=8x if sm >= 80 and local_window_size: return { @@ -37,7 +35,7 @@ def plot_prompt_performance( kv_num_heads: int, head_size: int, max_seq_len: int, - local_window_size: Optional[int] = None, + local_window_size: int | None = None, use_smooth_softmax: bool = False, ): import triton @@ -70,7 +68,7 @@ def benchmark( num_heads: int, kv_num_heads: int, head_size: int, - local_window_size: Optional[int] = None, + local_window_size: int | None = None, use_smooth_softmax: bool = False, device="cuda", ): @@ -107,7 +105,7 @@ def plot_token_performance( kv_num_heads: int, head_size: int, max_seq_len: int, - local_window_size: Optional[int] = None, + local_window_size: int | None = None, use_smooth_softmax: bool = False, ): import triton @@ -140,7 +138,7 @@ def benchmark( num_heads: int, kv_num_heads: int, head_size: int, - local_window_size: Optional[int] = None, + local_window_size: int | None = None, use_smooth_softmax: bool = False, device="cuda", ): diff --git a/onnxruntime/test/python/transformers/benchmark_gqa_windows.py b/onnxruntime/test/python/transformers/benchmark_gqa_windows.py index 79cc8e41bf343..97ff8f4b21a68 100644 --- a/onnxruntime/test/python/transformers/benchmark_gqa_windows.py +++ b/onnxruntime/test/python/transformers/benchmark_gqa_windows.py @@ -1,7 +1,6 @@ import argparse import os import time -from typing import Optional import torch from test_sparse_attention import GroupQueryAttentionConfig, OrtGroupQueryAttention @@ -36,7 +35,7 @@ def benchmark( max_seq_len: int, sequence_length: int = 1, past_sequence_length: int = 0, - local_window_size: Optional[int] = None, + local_window_size: int | None = None, use_smooth_softmax: bool = False, model_name: str = "Llama3-8B", ): diff --git a/onnxruntime/test/python/transformers/benchmark_mha.py b/onnxruntime/test/python/transformers/benchmark_mha.py index d922f153b4b91..d5bcabe0bf147 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.py +++ b/onnxruntime/test/python/transformers/benchmark_mha.py @@ -23,10 +23,10 @@ import sys import threading import time +from collections.abc import Callable from contextlib import nullcontext from datetime import datetime from enum import IntEnum -from typing import Callable, Dict, List, Optional, Tuple import torch import torch.utils.benchmark as benchmark @@ -56,7 +56,7 @@ def convert(format_str: str) -> int: return names.index(format_str) @staticmethod - def get_name_list() -> List[str]: + def get_name_list() -> list[str]: return ["Q,K,V", "QKV", "Q,KV", "Q,K',V'"] @@ -95,7 +95,7 @@ def __init__( max_cache_sequence_length=None, scale: float = 0.0, provider="CPUExecutionProvider", - device: Optional[torch.device] = None, + device: torch.device | None = None, enable_cuda_graph: bool = False, dtype=torch.float, use_kv_cache: bool = False, @@ -205,7 +205,7 @@ def __repr__(self): ) def shape_dict(self, input_format=None): - shapes: Dict[str, Tuple] = { + shapes: dict[str, tuple] = { "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), } @@ -272,7 
+272,7 @@ def shape_dict(self, input_format=None): return shapes def symbolic_shape_dict(self, input_format=None): - shapes: Dict[str, Tuple] = { + shapes: dict[str, tuple] = { "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), } @@ -346,7 +346,7 @@ def right_side_padding_masks(self): ) if self.mask_format != AttentionMaskFormat.Mask_None: - for i, (m, n) in enumerate(zip(self.mask_index_q, self.mask_index_kv)): + for i, (m, n) in enumerate(zip(self.mask_index_q, self.mask_index_kv, strict=False)): q_mask[i, :, m:, :] = False k_mask[i, :, n:, :] = False mask[i, :, m:, :] = False @@ -660,7 +660,7 @@ def run_torch_sdpa( has_mask: bool = False, mask_dim: int = 2, mask_dtype=torch.bool, - backend: Optional[int] = None, + backend: int | None = None, repeats: int = 100, ): q_shape = (batch_size, num_heads, q_seq_len, head_size) diff --git a/onnxruntime/test/python/transformers/bert_model_generator.py b/onnxruntime/test/python/transformers/bert_model_generator.py index a84137f092e64..0bb71bd8736d4 100644 --- a/onnxruntime/test/python/transformers/bert_model_generator.py +++ b/onnxruntime/test/python/transformers/bert_model_generator.py @@ -5,7 +5,6 @@ # -------------------------------------------------------------------------- import math -from typing import List import numpy as np import onnx @@ -13,7 +12,7 @@ from packaging import version -def float_tensor(name: str, shape: List[int], random=False): +def float_tensor(name: str, shape: list[int], random=False): low = 0.0 high = 1.0 total_elements = 1 diff --git a/onnxruntime/test/python/transformers/conformer_model_generator.py b/onnxruntime/test/python/transformers/conformer_model_generator.py index 71e4f2b63cf4f..4e76478bfb649 100644 --- a/onnxruntime/test/python/transformers/conformer_model_generator.py +++ b/onnxruntime/test/python/transformers/conformer_model_generator.py @@ -4,7 +4,6 @@ # license information. # -------------------------------------------------------------------------- -from typing import List import numpy as np import onnx @@ -13,7 +12,7 @@ # Adapted from bert_model_generator.py -def get_tensor_and_weight(name: str, shape: List[int], random=False, zeros=False): +def get_tensor_and_weight(name: str, shape: list[int], random=False, zeros=False): low = 0.0 high = 1.0 total_elements = 1 diff --git a/onnxruntime/test/python/transformers/gpt2_model_generator.py b/onnxruntime/test/python/transformers/gpt2_model_generator.py index 0865c87b70da7..74136c2b8bc61 100644 --- a/onnxruntime/test/python/transformers/gpt2_model_generator.py +++ b/onnxruntime/test/python/transformers/gpt2_model_generator.py @@ -5,7 +5,6 @@ # -------------------------------------------------------------------------- import math -from typing import List # noqa: F401 import numpy import onnx diff --git a/onnxruntime/test/python/transformers/rotary_flash.py b/onnxruntime/test/python/transformers/rotary_flash.py index 4329b2c1a6057..a033805ec0d5e 100644 --- a/onnxruntime/test/python/transformers/rotary_flash.py +++ b/onnxruntime/test/python/transformers/rotary_flash.py @@ -1,8 +1,6 @@ # Copyright (c) 2023, Tri Dao. 
-from typing import Optional, Tuple, Union - import torch import triton import triton.language as tl @@ -142,9 +140,9 @@ def apply_rotary( x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, - seqlen_offsets: Union[int, torch.Tensor] = 0, - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[int] = None, + seqlen_offsets: int | torch.Tensor = 0, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: int | None = None, interleaved=False, inplace=False, conjugate=False, @@ -265,9 +263,9 @@ def forward( sin, interleaved=False, inplace=False, - seqlen_offsets: Union[int, torch.Tensor] = 0, - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[int] = None, + seqlen_offsets: int | torch.Tensor = 0, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: int | None = None, ): out = apply_rotary( x, @@ -321,9 +319,9 @@ def apply_rotary_emb( sin, interleaved=False, inplace=False, - seqlen_offsets: Union[int, torch.Tensor] = 0, - cu_seqlens: Optional[torch.Tensor] = None, - max_seqlen: Optional[int] = None, + seqlen_offsets: int | torch.Tensor = 0, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: int | None = None, ): """ Arguments: @@ -360,7 +358,7 @@ def forward( cos_k=None, sin_k=None, interleaved=False, - seqlen_offsets: Union[int, torch.Tensor] = 0, + seqlen_offsets: int | torch.Tensor = 0, ): batch, seqlen, three, nheads, headdim = qkv.shape assert three == 3 @@ -432,7 +430,7 @@ def apply_rotary_emb_qkv_( cos_k=None, sin_k=None, interleaved=False, - seqlen_offsets: Union[int, torch.Tensor] = 0, + seqlen_offsets: int | torch.Tensor = 0, ): """ Arguments: @@ -453,7 +451,7 @@ def apply_rotary_emb_qkv_( class ApplyRotaryEmbKV(torch.autograd.Function): @staticmethod - def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: Union[int, torch.Tensor] = 0): + def forward(ctx, kv, cos, sin, interleaved=False, seqlen_offsets: int | torch.Tensor = 0): batch, seqlen, two, nheads, headdim = kv.shape assert two == 2 k = kv[:, :, 0] @@ -491,7 +489,7 @@ def apply_rotary_emb_kv_( cos, sin, interleaved=False, - seqlen_offsets: Union[int, torch.Tensor] = 0, + seqlen_offsets: int | torch.Tensor = 0, ): """ Arguments: @@ -623,10 +621,10 @@ def _update_cos_sin_cache(self, seqlen, device=None, dtype=None): def forward( self, qkv: torch.Tensor, - kv: Optional[torch.Tensor] = None, - seqlen_offset: Union[int, torch.Tensor] = 0, - max_seqlen: Optional[int] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + kv: torch.Tensor | None = None, + seqlen_offset: int | torch.Tensor = 0, + max_seqlen: int | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ qkv: (batch, seqlen, 3, nheads, headdim) if kv is none, else it's just q of shape (batch, seqlen, nheads, headdim) diff --git a/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py b/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py index 431ae21cd5eaf..c4c136981e7a9 100644 --- a/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py +++ b/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py @@ -6,7 +6,6 @@ import os import unittest -from typing import List import numpy as np import onnx @@ -33,7 +32,7 @@ opsets = [onnxdomain, msdomain] -def float_tensor(name: str, shape: List[int], random=False): +def float_tensor(name: str, shape: list[int], random=False): low = 0.0 high = 1.0 total_elements = 1 diff --git a/onnxruntime/test/python/transformers/test_group_norm.py b/onnxruntime/test/python/transformers/test_group_norm.py index 
bf295a65c8b53..7a04df8b39c0d 100644 --- a/onnxruntime/test/python/transformers/test_group_norm.py +++ b/onnxruntime/test/python/transformers/test_group_norm.py @@ -7,7 +7,6 @@ from dataclasses import dataclass from enum import Enum from time import perf_counter -from typing import Optional, Tuple import numpy import torch @@ -215,11 +214,11 @@ def group_norm_ort( src: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor, - skip: Optional[torch.Tensor], - bias: Optional[torch.Tensor], + skip: torch.Tensor | None, + bias: torch.Tensor | None, config: GroupNormConfig, measure_latency=False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[float]]: +) -> tuple[torch.Tensor, torch.Tensor | None, float | None]: onnx_model_str = create_group_norm_graph(config) ort_session = InferenceSession(onnx_model_str, providers=["CUDAExecutionProvider"]) @@ -276,10 +275,10 @@ def group_norm_torch( src: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor, - skip: Optional[torch.Tensor], - bias: Optional[torch.Tensor], + skip: torch.Tensor | None, + bias: torch.Tensor | None, config: GroupNormConfig, -) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: +) -> tuple[torch.Tensor, torch.Tensor | None]: add_out = src if skip is not None: diff --git a/onnxruntime/test/python/transformers/test_mha.py b/onnxruntime/test/python/transformers/test_mha.py index 6f396f35f7146..dc19e3ec95243 100644 --- a/onnxruntime/test/python/transformers/test_mha.py +++ b/onnxruntime/test/python/transformers/test_mha.py @@ -11,7 +11,6 @@ import itertools import os import unittest -from typing import Dict, List, Optional import numpy import torch @@ -102,9 +101,9 @@ def attention_reference( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - scale: Optional[float] = None, - attn_bias: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, + scale: float | None = None, + attn_bias: torch.Tensor | None = None, + mask: torch.Tensor | None = None, verbose: bool = False, ) -> torch.Tensor: """Reference implementation of SDPA @@ -171,14 +170,14 @@ def attention_reference( def mha_with_past_reference( config: MultiHeadAttentionConfig, - past_k: Optional[torch.Tensor], - past_v: Optional[torch.Tensor], + past_k: torch.Tensor | None, + past_v: torch.Tensor | None, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - scale: Optional[float] = None, - attn_bias: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, + scale: float | None = None, + attn_bias: torch.Tensor | None = None, + mask: torch.Tensor | None = None, ): assert config.kv_sequence_length == config.sequence_length assert config.use_kv_cache @@ -648,7 +647,7 @@ def parity_check_mha( def parity_check_mha_multi_threading( - test_inputs: List[Dict], + test_inputs: list[dict], rtol: float = 1e-3, atol: float = 1e-3, attention_kernel=SdpaKernel.DEFAULT, diff --git a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py index e870e7f95fcee..8b4a68402f995 100644 --- a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py @@ -10,7 +10,6 @@ # license information. 
# ------------------------------------------------------------------------- -from typing import List, Optional, Tuple import numpy import torch @@ -118,7 +117,7 @@ def forward( self, query, key, - layer_state: Optional[List[Tensor]], + layer_state: list[Tensor] | None, encoder_decoder_attention: bool, use_past=torch.tensor(False), # noqa: B008 ): @@ -182,13 +181,13 @@ def forward( self, query, key: Tensor, - key_padding_mask: Optional[Tensor] = None, - layer_state: Optional[List[Tensor]] = None, - attn_mask: Optional[Tensor] = None, + key_padding_mask: Tensor | None = None, + layer_state: list[Tensor] | None = None, + attn_mask: Tensor | None = None, output_attentions: bool = False, use_past=torch.tensor(False), # noqa: B008 has_key_padding_mask: bool = False, - ) -> Tuple[Tensor, Optional[Tensor]]: + ) -> tuple[Tensor, Tensor | None]: """Input shape: Time(SeqLen) x Batch x Channel""" static_kv: bool = self.encoder_decoder_attention tgt_len, bsz, embed_dim = query.size() @@ -241,13 +240,13 @@ def ort_forward( self, query, key: Tensor, - key_padding_mask: Optional[Tensor] = None, - layer_state: Optional[List[Tensor]] = None, - attn_mask: Optional[Tensor] = None, + key_padding_mask: Tensor | None = None, + layer_state: list[Tensor] | None = None, + attn_mask: Tensor | None = None, output_attentions: bool = False, use_past=torch.tensor(False), # noqa: B008 has_key_padding_mask: bool = False, - ) -> Tuple[Tensor, Optional[Tensor]]: + ) -> tuple[Tensor, Tensor | None]: """Input shape: Time(SeqLen) x Batch x Channel""" # For readability static_kv = bool(self.encoder_decoder_attention) diff --git a/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py b/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py index 7bca48c29019e..89ef0342fab74 100644 --- a/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py +++ b/onnxruntime/test/python/transformers/test_rotary_embedding_fusion.py @@ -6,7 +6,6 @@ import os import sys import unittest -from typing import List import numpy as np import onnx @@ -23,7 +22,7 @@ from onnxruntime.transformers.optimizer import optimize_model -def float_tensor(name: str, shape: List[int], random=False): +def float_tensor(name: str, shape: list[int], random=False): low = 0.0 high = 1.0 total_elements = 1 @@ -113,7 +112,7 @@ def create_inputs_and_outputs(self, model_type: str = ""): outputs.append(helper.make_tensor_value_info("past_seq_len_plus_zero", TensorProto.FLOAT, [1])) return inputs, outputs - def create_fused_model(self, interleaved: bool, initializers: List[TensorProto]): + def create_fused_model(self, interleaved: bool, initializers: list[TensorProto]): inputs, outputs = self.create_inputs_and_outputs() rope_node = helper.make_node( @@ -385,7 +384,7 @@ def create_apply_rope_path(self): return x_half_shape_nodes + rotate_half_nodes + x_embed_nodes - def create_test_model(self, model_type: str, use_redundant_squeeze_ops: bool, initializers: List[TensorProto]): + def create_test_model(self, model_type: str, use_redundant_squeeze_ops: bool, initializers: list[TensorProto]): apply_rope_nodes = self.create_apply_rope_path() cache_nodes = self.create_cache_path(model_type, use_redundant_squeeze_ops) inputs, outputs = self.create_inputs_and_outputs(model_type) diff --git a/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py b/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py index aba0ccdac2e6e..0ec5c684532cc 100644 --- a/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py +++ 
b/onnxruntime/test/python/transformers/test_rotary_mha_fusion.py @@ -6,7 +6,6 @@ import os import sys import unittest -from typing import List import numpy as np import onnx @@ -23,7 +22,7 @@ from onnxruntime.transformers.optimizer import optimize_model -def float_tensor(name: str, shape: List[int], random=False): +def float_tensor(name: str, shape: list[int], random=False): low = 0.0 high = 1.0 total_elements = 1 @@ -157,8 +156,8 @@ def create_rotary_embeddings( is_fused: bool, model_type: str, interleaved: bool, - inputs: List[TensorProto], - initializers: List[TensorProto], + inputs: list[TensorProto], + initializers: list[TensorProto], ): def get_first_rope_input(node_type: str): if is_fused or model_type == "llama2_msft": @@ -974,7 +973,7 @@ def create_qkv_path(self, model_type: str): return qkv_nodes + [transpose_qkv_node, reshape_qkv_2_node] # noqa: RUF005 - def create_concat_unsqueeze_paths(self, model_type: str, reshape_nodes: List[NodeProto]): + def create_concat_unsqueeze_paths(self, model_type: str, reshape_nodes: list[NodeProto]): # Create initial shape paths shape_0_node = helper.make_node( "Shape", @@ -1097,7 +1096,7 @@ def create_end_nodes(self, model_type): ) return [matmul_o_node, end_node] - def create_fused_model(self, model_type: str, interleaved: bool, initializers: List[TensorProto]): + def create_fused_model(self, model_type: str, interleaved: bool, initializers: list[TensorProto]): inputs, outputs = self.create_inputs_and_outputs(model_type) matmul_nodes = self.create_matmul_nodes(True, model_type=model_type) rope_nodes = self.create_rotary_embeddings(True, model_type, interleaved, inputs, initializers) @@ -1134,7 +1133,7 @@ def create_fused_model(self, model_type: str, interleaved: bool, initializers: L model = helper.make_model(graph, opset_imports=[opset_import]) return model - def create_test_model(self, model_type: str, interleaved: bool, initializers: List[TensorProto]): + def create_test_model(self, model_type: str, interleaved: bool, initializers: list[TensorProto]): inputs, outputs = self.create_inputs_and_outputs(model_type) matmul_nodes = self.create_matmul_nodes(False, model_type) rope_nodes = self.create_rotary_embeddings(False, model_type, interleaved, inputs, initializers) diff --git a/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py b/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py index e86bdda7baffb..95639958dbb2e 100644 --- a/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py +++ b/onnxruntime/test/python/transformers/test_simplified_layernorm_fusion.py @@ -5,7 +5,6 @@ import os import unittest -from typing import List import numpy as np import onnx @@ -22,7 +21,7 @@ from onnxruntime.transformers.optimizer import optimize_model -def float_tensor(name: str, shape: List[int], random=False): +def float_tensor(name: str, shape: list[int], random=False): low = 0.0 high = 1.0 total_elements = 1 @@ -115,7 +114,7 @@ def create_inputs_and_outputs(self, start_node_type: str): ] return inputs, outputs, start_node - def create_fused_model(self, start_node_type: str, initializers: List[TensorProto]): + def create_fused_model(self, start_node_type: str, initializers: list[TensorProto]): inputs, outputs, start_node = self.create_inputs_and_outputs(start_node_type) sln_node = helper.make_node( @@ -139,7 +138,7 @@ def create_fused_model(self, start_node_type: str, initializers: List[TensorProt return model # Notation follows https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary 
- def create_test_model(self, start_node_type: str, first_parent_idx: int, initializers: List[TensorProto]): + def create_test_model(self, start_node_type: str, first_parent_idx: int, initializers: list[TensorProto]): end_node = helper.make_node( "Mul", inputs=["scale", "Normalized"] if first_parent_idx == 1 else ["Normalized", "scale"], @@ -197,7 +196,7 @@ def create_test_model(self, start_node_type: str, first_parent_idx: int, initial model = helper.make_model(graph, opset_imports=[opset_import]) return model - def check_models(self, start_node_type: str, first_parent_idx: int, initializers: List[TensorProto]): + def check_models(self, start_node_type: str, first_parent_idx: int, initializers: list[TensorProto]): expected_model_filename = "expected_model.onnx" expected_model = self.create_fused_model(start_node_type, initializers) onnx.save(expected_model, expected_model_filename) diff --git a/onnxruntime/test/python/transformers/test_skip_layer_norm_fusion.py b/onnxruntime/test/python/transformers/test_skip_layer_norm_fusion.py index 5b3a3f18cd744..a55ff5aa91519 100644 --- a/onnxruntime/test/python/transformers/test_skip_layer_norm_fusion.py +++ b/onnxruntime/test/python/transformers/test_skip_layer_norm_fusion.py @@ -6,7 +6,6 @@ import os import unittest -from typing import Dict, List import numpy as np import onnx @@ -21,7 +20,7 @@ from onnxruntime.transformers.optimizer import optimize_model -def float_tensor(name: str, shape: List[int], random=False): +def float_tensor(name: str, shape: list[int], random=False): low = 0.0 high = 1.0 total_elements = 1 @@ -35,9 +34,9 @@ class TestFusion(unittest.TestCase): def verify_skip_layer_norm_fusion( self, model_path: str, - expected_counter: Dict[str, int], - expected_inputs: List[str], - expected_outputs: List[str], + expected_counter: dict[str, int], + expected_inputs: list[str], + expected_outputs: list[str], ): options = FusionOptions("bert") optimized_model = optimize_model(model_path, optimization_options=options, opt_level=0) diff --git a/onnxruntime/test/python/transformers/test_sparse_attention.py b/onnxruntime/test/python/transformers/test_sparse_attention.py index 774761afddc8a..eac6bbdc3dd12 100644 --- a/onnxruntime/test/python/transformers/test_sparse_attention.py +++ b/onnxruntime/test/python/transformers/test_sparse_attention.py @@ -9,7 +9,6 @@ import math import unittest -from typing import Optional, Union import torch from benchmark_mha import InputFormats @@ -34,7 +33,7 @@ def __init__( num_heads: int, kv_num_heads: int, head_size: int, - softmax_scale: Optional[float], + softmax_scale: float | None, do_rotary: bool, rotary_interleaved: bool, provider: str = "CUDAExecutionProvider", @@ -602,8 +601,8 @@ def group_query_attention_reference( key: Tensor, value: Tensor, config: GroupQueryAttentionConfig, - scale: Optional[float] = None, - mask: Optional[Tensor] = None, + scale: float | None = None, + mask: Tensor | None = None, ): if scale is None: scale = 1.0 / (config.head_size**0.5) @@ -704,7 +703,7 @@ def infer(self): def create_ort_session( - config: Union[SparseAttentionConfig, GroupQueryAttentionConfig], session_options=None, enable_cuda_graph=False + config: SparseAttentionConfig | GroupQueryAttentionConfig, session_options=None, enable_cuda_graph=False ) -> CudaSession: if isinstance(config, SparseAttentionConfig): onnx_model_str = create_sparse_attention_onnx_model(config) diff --git a/onnxruntime/test/python/transformers/whisper_model_generator.py b/onnxruntime/test/python/transformers/whisper_model_generator.py index 
71d1a4cbdceeb..f1a692b7694cb 100644 --- a/onnxruntime/test/python/transformers/whisper_model_generator.py +++ b/onnxruntime/test/python/transformers/whisper_model_generator.py @@ -4,7 +4,6 @@ # license information. # -------------------------------------------------------------------------- -from typing import List import numpy as np import onnx @@ -13,7 +12,7 @@ # Adapted from bert_model_generator.py -def get_tensor_and_weight(name: str, shape: List[int], random=False, zeros=False): +def get_tensor_and_weight(name: str, shape: list[int], random=False, zeros=False): low = 0.0 high = 1.0 total_elements = 1 diff --git a/onnxruntime/test/testdata/CNTK/gen.py b/onnxruntime/test/testdata/CNTK/gen.py index 5a3ca461f471a..b5f39bcb448f9 100644 --- a/onnxruntime/test/testdata/CNTK/gen.py +++ b/onnxruntime/test/testdata/CNTK/gen.py @@ -23,7 +23,7 @@ def SaveTensorProto(file_path, variable, data, name): # noqa: N802 def SaveData(test_data_dir, prefix, variables, data_list, name_replacements=None): # noqa: N802 if isinstance(data_list, np.ndarray): data_list = [data_list] - for (i, d), v in zip(enumerate(data_list), variables): + for (i, d), v in zip(enumerate(data_list), variables, strict=False): SaveTensorProto( os.path.join(test_data_dir, f"{prefix}_{i}.pb"), v, diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py index 8e6dbe5ea581d..594da08abb1fb 100644 --- a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/ep_partitioning_tests.py b/onnxruntime/test/testdata/ep_partitioning_tests.py index 6c8322bb9bd62..367cafb795bad 100644 --- a/onnxruntime/test/testdata/ep_partitioning_tests.py +++ b/onnxruntime/test/testdata/ep_partitioning_tests.py @@ -1,4 +1,3 @@ -import numpy as np # noqa: F401 import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index e6c51009018f9..0c1ea47fff5b1 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/sparse_initializer_as_output.py b/onnxruntime/test/testdata/sparse_initializer_as_output.py index b10c84ccc1723..25d66b40a7c73 100644 --- a/onnxruntime/test/testdata/sparse_initializer_as_output.py +++ b/onnxruntime/test/testdata/sparse_initializer_as_output.py @@ -1,21 +1,13 @@ import argparse -import os # noqa: F401 import sys import traceback -from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast # noqa: F401 import numpy as np import onnx from onnx import ( - AttributeProto, # noqa: F401 - GraphProto, # noqa: F401 - SparseTensorProto, # noqa: F401 TensorProto, ValueInfoProto, helper, - mapping, # noqa: F401 - numpy_helper, # noqa: F401 - utils, # noqa: F401 ) from onnx.helper import make_opsetid diff --git a/onnxruntime/test/testdata/sparse_to_dense_matmul.py b/onnxruntime/test/testdata/sparse_to_dense_matmul.py index 57a15ba72308e..5a8a00cc7748e 100644 --- a/onnxruntime/test/testdata/sparse_to_dense_matmul.py +++ b/onnxruntime/test/testdata/sparse_to_dense_matmul.py @@ -1,21 +1,12 @@ import argparse -import os # 
noqa: F401 import sys import traceback -from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast # noqa: F401 -import numpy as np # noqa: F401 import onnx from onnx import ( - AttributeProto, # noqa: F401 - GraphProto, # noqa: F401 - SparseTensorProto, # noqa: F401 TensorProto, ValueInfoProto, helper, - mapping, # noqa: F401 - numpy_helper, # noqa: F401 - utils, # noqa: F401 ) from onnx.helper import make_opsetid diff --git a/onnxruntime/test/testdata/transform/computation_reduction.py b/onnxruntime/test/testdata/transform/computation_reduction.py index 6f726a54261ed..af0a39636f9ee 100644 --- a/onnxruntime/test/testdata/transform/computation_reduction.py +++ b/onnxruntime/test/testdata/transform/computation_reduction.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper vocab_size = 256 # 30258 diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py index cd823ce8391c2..7caf7045ccb93 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py index ee25bef5c1161..86413b8679a56 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py index dc2abf1dda586..ffaf62a243359 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py 
b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py index bc850c4031741..65767a8986746 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/concat_slice_elimination.py b/onnxruntime/test/testdata/transform/concat_slice_elimination.py index 9eade63328aec..97f0c6f243f60 100644 --- a/onnxruntime/test/testdata/transform/concat_slice_elimination.py +++ b/onnxruntime/test/testdata/transform/concat_slice_elimination.py @@ -1,8 +1,6 @@ -import random # noqa: F401 - import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper batch = 3 hidden_size = 4 diff --git a/onnxruntime/test/testdata/transform/cse/generate.py b/onnxruntime/test/testdata/transform/cse/generate.py index ecca4f586f400..01d62422983b5 100644 --- a/onnxruntime/test/testdata/transform/cse/generate.py +++ b/onnxruntime/test/testdata/transform/cse/generate.py @@ -1,7 +1,7 @@ import os import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper, shape_inference # noqa: F401 +from onnx import TensorProto, helper, shape_inference _this_dir = os.path.abspath(os.path.dirname(__file__)) diff --git a/onnxruntime/test/testdata/transform/expand_elimination.py b/onnxruntime/test/testdata/transform/expand_elimination.py index 86340c9e2553c..226c23fa66389 100644 --- a/onnxruntime/test/testdata/transform/expand_elimination.py +++ b/onnxruntime/test/testdata/transform/expand_elimination.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X1 = helper.make_tensor_value_info("input1", TensorProto.FLOAT, [2, 1]) X2 = helper.make_tensor_value_info("input2", TensorProto.FLOAT, ["dynamic", 4]) diff --git a/onnxruntime/test/testdata/transform/fusion/attention_gen.py b/onnxruntime/test/testdata/transform/fusion/attention_gen.py index 19f46ab9f358a..6ff0ea5ba9983 100644 --- a/onnxruntime/test/testdata/transform/fusion/attention_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/attention_gen.py @@ -1,5 +1,4 @@ import sys -from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py b/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py index c49ae8b0a422c..65b37a8ed9dab 100644 --- a/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py +++ b/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper X = 
helper.make_tensor_value_info("input", TensorProto.FLOAT, [2, 4, 8]) Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, [2, 4, 16]) diff --git a/onnxruntime/test/testdata/transform/fusion/div_mul.py b/onnxruntime/test/testdata/transform/fusion/div_mul.py index 8cd34a6b53fcf..e7b1f4632afbd 100644 --- a/onnxruntime/test/testdata/transform/fusion/div_mul.py +++ b/onnxruntime/test/testdata/transform/fusion/div_mul.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py index 3ec3cabbc8b77..e590b46129d7b 100644 --- a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py +++ b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py index 54fe7b808bf12..f83bedeb8012c 100644 --- a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import TensorProto, helper from packaging import version diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu.py index 20d78b6684609..a16d7e66752bf 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper # Gelu formula: x * 0.5 * (1.0 + tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py index 718f924ae5902..6922f3ad0a82a 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper # Gelu formula: x * 0.5 * (1.0 + tanh((sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))) has_bias = False # change it to True to generate fast_gelu_openai_with_bias.onnx diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py index d7cfc351b8e97..d91e186296137 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper # Gelu formula: x * 0.5 * (1.0 + tanh((sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))) diff --git a/onnxruntime/test/testdata/transform/fusion/gelu_gen.py 
b/onnxruntime/test/testdata/transform/fusion/gelu_gen.py index 428bb0ce00df0..8a4c3ae491215 100644 --- a/onnxruntime/test/testdata/transform/fusion/gelu_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/gelu_gen.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper """ Generate test model for Gelu subgraph pattern 2: diff --git a/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py b/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py index c6e70fe478701..a9c88618c5c70 100644 --- a/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py +++ b/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py index aa4b78f4525de..c0e2bc85f8248 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py index 61b2e2249e7a3..fa83290138d87 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py @@ -1,6 +1,3 @@ -from enum import Enum # noqa: F401 - -import numpy as np # noqa: F401 import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index 018e5fb332dd0..f9b154c46fbd1 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py index 543517cc015ef..6b60a47255c5d 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float_large_tensor.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/fusion/not_where.py b/onnxruntime/test/testdata/transform/fusion/not_where.py index 82a128153ac70..014d0b8fc531a 100644 --- a/onnxruntime/test/testdata/transform/fusion/not_where.py +++ b/onnxruntime/test/testdata/transform/fusion/not_where.py @@ -1,5 +1,3 @@ -from enum import Enum # noqa: F401 - import onnx from onnx import OperatorSetIdProto, TensorProto, helper diff --git a/onnxruntime/test/testdata/transform/id-elim.py b/onnxruntime/test/testdata/transform/id-elim.py index 1f7b6e2607702..eef8011e7fe23 100644 --- a/onnxruntime/test/testdata/transform/id-elim.py +++ b/onnxruntime/test/testdata/transform/id-elim.py @@ -1,6 +1,5 @@ -import numpy 
as np # noqa: F401 import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [4, 4]) X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [4, 4]) diff --git a/onnxruntime/test/testdata/transform/id-scan9_sum.py b/onnxruntime/test/testdata/transform/id-scan9_sum.py index 7ffd2e21b7333..c813bbfc18d8e 100644 --- a/onnxruntime/test/testdata/transform/id-scan9_sum.py +++ b/onnxruntime/test/testdata/transform/id-scan9_sum.py @@ -1,6 +1,5 @@ -import numpy as np # noqa: F401 import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper initial = helper.make_tensor_value_info("initial", TensorProto.FLOAT, [2]) x = helper.make_tensor_value_info("x", TensorProto.FLOAT, [3, 2]) diff --git a/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py index 503d860baab67..7879bb4d4e0ff 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper hidden_size = 4 weight_dim_to_split = 16 diff --git a/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py index 20bdebead3dac..886cd5c25fb08 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py @@ -1,8 +1,6 @@ -import random # noqa: F401 - import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper batch = 6 hidden_size = 4 diff --git a/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py index 07487ee4880ed..5dec4899d59af 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper hidden_size = 4 weight_dim_to_split = 16 diff --git a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py index 306ad7d37403a..3749da038d93e 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, 
numpy_helper # noqa: F401 +from onnx import OperatorSetIdProto, TensorProto, helper, numpy_helper hidden_size = 4 attention_head = 2 diff --git a/orttraining/orttraining/python/training/artifacts.py b/orttraining/orttraining/python/training/artifacts.py index 31591c0156b14..c304d2f262650 100644 --- a/orttraining/orttraining/python/training/artifacts.py +++ b/orttraining/orttraining/python/training/artifacts.py @@ -6,7 +6,6 @@ import os import pathlib from enum import Enum -from typing import List, Optional, Union import onnx @@ -40,18 +39,18 @@ class OptimType(Enum): def generate_artifacts( - model: Union[onnx.ModelProto, str], - requires_grad: Optional[List[str]] = None, - frozen_params: Optional[List[str]] = None, - loss: Optional[Union[LossType, onnxblock.Block]] = None, - optimizer: Optional[Union[OptimType, onnxblock.Block]] = None, - artifact_directory: Optional[Union[str, bytes, os.PathLike]] = None, + model: onnx.ModelProto | str, + requires_grad: list[str] | None = None, + frozen_params: list[str] | None = None, + loss: LossType | onnxblock.Block | None = None, + optimizer: OptimType | onnxblock.Block | None = None, + artifact_directory: str | bytes | os.PathLike | None = None, prefix: str = "", ort_format: bool = False, - custom_op_library: Optional[Union[str, bytes, os.PathLike]] = None, - additional_output_names: Optional[List[str]] = None, + custom_op_library: str | bytes | os.PathLike | None = None, + additional_output_names: list[str] | None = None, nominal_checkpoint: bool = False, - loss_input_names: Optional[List[str]] = None, + loss_input_names: list[str] | None = None, ) -> None: """Generates artifacts required for training with ORT training api. diff --git a/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py b/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py index 5ab79b3712472..9ea12753a254b 100644 --- a/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py +++ b/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py @@ -1,6 +1,7 @@ import io +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable, Optional, Union # noqa: F401 +from typing import Any import torch from torch.onnx import TrainingMode @@ -15,7 +16,7 @@ def export_gradient_graph( loss_fn: Callable[[Any, Any], Any], example_input: torch.Tensor, example_labels: torch.Tensor, - gradient_graph_path: Union[Path, str], + gradient_graph_path: Path | str, opset_version=12, ) -> None: r""" @@ -45,7 +46,7 @@ def export_gradient_graph( class WrapperModule(torch.nn.Module): def forward(self, model_input, expected_labels, *model_params): - for param, set_param in zip(model.parameters(), model_params): + for param, set_param in zip(model.parameters(), model_params, strict=False): param.data = set_param.data output = model(model_input) loss = loss_fn(output, expected_labels) diff --git a/orttraining/orttraining/python/training/onnxblock/_graph_utils.py b/orttraining/orttraining/python/training/onnxblock/_graph_utils.py index 42743a4200d17..fd10e6b65fb84 100644 --- a/orttraining/orttraining/python/training/onnxblock/_graph_utils.py +++ b/orttraining/orttraining/python/training/onnxblock/_graph_utils.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
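# --- Illustrative sketch (not part of this diff) ----------------------------
# The annotation changes above (e.g. in generate_artifacts) follow PEP 585/604:
# built-in generics such as list[str] and dict[str, int] replace typing.List
# and typing.Dict, "X | None" replaces Optional[X], and Callable now comes
# from collections.abc. A minimal, hypothetical function written with the new
# spellings (names invented for illustration); runtime behaviour is identical
# to the old typing.List/Optional/Union form on Python 3.10+:
from collections.abc import Callable


def describe_model(
    model_path: str | bytes,                 # was: Union[str, bytes]
    output_names: list[str] | None = None,   # was: Optional[List[str]]
    on_each: Callable[[str], None] = print,  # was: typing.Callable[[str], None]
) -> dict[str, int]:                         # was: Dict[str, int]
    counts: dict[str, int] = {}
    for name in output_names or []:
        on_each(name)
        counts[name] = len(name)
    return counts


describe_model("model.onnx", ["loss", "logits"])
# --- end sketch --------------------------------------------------------------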
-from typing import List, Union import onnx @@ -43,7 +42,7 @@ def generate_graph_name(token: str) -> str: return f"onnx::{token}::{_get_token()}" -def register_graph_outputs(model: onnx.ModelProto, output_names: Union[List[str], str]) -> None: +def register_graph_outputs(model: onnx.ModelProto, output_names: list[str] | str) -> None: """Register the given output names as graph outputs. The graph outputs shape information is extracted from the graph value_infos and diff --git a/orttraining/orttraining/python/training/onnxblock/_training_graph_utils.py b/orttraining/orttraining/python/training/onnxblock/_training_graph_utils.py index 1213342004d48..fbdbac3504b65 100644 --- a/orttraining/orttraining/python/training/onnxblock/_training_graph_utils.py +++ b/orttraining/orttraining/python/training/onnxblock/_training_graph_utils.py @@ -4,7 +4,6 @@ import copy import logging import os -from typing import List, Optional, Set, Tuple, Union import onnx @@ -35,7 +34,7 @@ def disable_training_mode_batchnorm(node): ops_to_disable_training_mode_func_map[node.op_type](node) -def _reorder_outputs(model: onnx.ModelProto, user_output_names: List[str], requires_grad: Set[str]) -> None: +def _reorder_outputs(model: onnx.ModelProto, user_output_names: list[str], requires_grad: set[str]) -> None: """Reorders the outputs of the model to match the order of [user_outputs, gradients]""" graph_outputs = {output.name: output for output in model.graph.output} @@ -50,7 +49,7 @@ def _reorder_outputs(model: onnx.ModelProto, user_output_names: List[str], requi model.graph.output.extend(ordered_graph_outputs) -def _move_initializers_to_inputs(model: onnx.ModelProto, initializer_names: Optional[Set[str]] = None) -> None: +def _move_initializers_to_inputs(model: onnx.ModelProto, initializer_names: set[str] | None = None) -> None: # Move all trainable and non trainable initializers to graph inputs. # This allows training to pass in the parameters from outside the graph # so as to share the parameters across multiple sessions. @@ -70,9 +69,9 @@ def _move_initializers_to_inputs(model: onnx.ModelProto, initializer_names: Opti def _gradient_model_for( model: onnx.ModelProto, - requires_grad: Set[str], + requires_grad: set[str], loss_name: str, - options: Optional[SessionOptions] = None, + options: SessionOptions | None = None, ) -> onnx.ModelProto: """Builds the gradient graph on top of the given input forward only graph.""" @@ -87,11 +86,11 @@ def _gradient_model_for( def build_gradient_graph( model: onnx.ModelProto, - requires_grad: Set[str], - frozen_params: Set[str], - output_names: Union[List[str], str], - custom_op_library: Optional[str] = None, -) -> Tuple[onnx.ModelProto, onnx.ModelProto]: + requires_grad: set[str], + frozen_params: set[str], + output_names: list[str] | str, + custom_op_library: str | None = None, +) -> tuple[onnx.ModelProto, onnx.ModelProto]: """Prepare the training model and the eval model. This function will restructure the model to prepare for training. @@ -134,7 +133,7 @@ def build_gradient_graph( return gradient_model, eval_model -def build_gradient_accumulation_graph(grad_model: onnx.ModelProto, requires_grad: Set[str]) -> None: +def build_gradient_accumulation_graph(grad_model: onnx.ModelProto, requires_grad: set[str]) -> None: """Builds gradient accumulation nodes on top of a training model. 
Adds an InPlaceAccumulatorV2 node for every gradient so that the gradients @@ -209,8 +208,8 @@ def build_gradient_accumulation_graph(grad_model: onnx.ModelProto, requires_grad def get_model_parameters( - model: onnx.ModelProto, requires_grad: Set[str], frozen_params: Set[str] -) -> Tuple[List[onnx.TensorProto], List[onnx.TensorProto]]: + model: onnx.ModelProto, requires_grad: set[str], frozen_params: set[str] +) -> tuple[list[onnx.TensorProto], list[onnx.TensorProto]]: """Returns trainable and non trainable onnx model parameters. This function pulls out the model parameters from the initializers in the graph. diff --git a/orttraining/orttraining/python/training/onnxblock/blocks.py b/orttraining/orttraining/python/training/onnxblock/blocks.py index c13843f816f16..24dc263eeb09b 100644 --- a/orttraining/orttraining/python/training/onnxblock/blocks.py +++ b/orttraining/orttraining/python/training/onnxblock/blocks.py @@ -6,7 +6,7 @@ import logging import os from abc import ABC, abstractmethod -from typing import Any, List, Optional +from typing import Any import numpy as np import onnx @@ -402,7 +402,7 @@ def __init__(self, like: str): self._like = like - def build(self, input_name: Optional[str] = None): + def build(self, input_name: str | None = None): cloned_input = None with contextlib.suppress(LookupError): # Suppress LookupError because we want to try to get the input from the output if it's not found in the inputs @@ -428,12 +428,12 @@ def __init__( default_float: float = 0.0, default_int64: int = -1, default_string: str = "_Unused", - keys_floats: Optional[List[float]] = None, - keys_int64s: Optional[List[int]] = None, - keys_strings: Optional[List[str]] = None, - values_floats: Optional[List[float]] = None, - values_int64s: Optional[List[int]] = None, - values_strings: Optional[List[str]] = None, + keys_floats: list[float] | None = None, + keys_int64s: list[int] | None = None, + keys_strings: list[str] | None = None, + values_floats: list[float] | None = None, + values_int64s: list[int] | None = None, + values_strings: list[str] | None = None, ): super().__init__() @@ -443,8 +443,8 @@ def __init__( "default_string": default_string, } - def _add_attributes(names: List[str], values: List[Any]): - for name, value in zip(names, values): + def _add_attributes(names: list[str], values: list[Any]): + for name, value in zip(names, values, strict=False): if value is not None: self._attributes[name] = value diff --git a/orttraining/orttraining/python/training/onnxblock/checkpoint_utils.py b/orttraining/orttraining/python/training/onnxblock/checkpoint_utils.py index de3453c630f9c..74292ea10a522 100644 --- a/orttraining/orttraining/python/training/onnxblock/checkpoint_utils.py +++ b/orttraining/orttraining/python/training/onnxblock/checkpoint_utils.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import os -from typing import List, Tuple, Union import onnx @@ -11,8 +10,8 @@ def save_checkpoint( - parameters: Tuple[List[onnx.TensorProto], List[onnx.TensorProto]], - path_to_checkpoint: Union[str, os.PathLike], + parameters: tuple[list[onnx.TensorProto], list[onnx.TensorProto]], + path_to_checkpoint: str | os.PathLike, nominal_checkpoint: bool = False, ) -> None: """Saves the parameters to the checkpoint directory path_to_checkpoint. 
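# --- Illustrative sketch (not part of this diff) ----------------------------
# Several hunks above add strict=False to zip() calls. On Python 3.10+,
# zip(..., strict=True) raises ValueError when the iterables have different
# lengths, while strict=False keeps the historical behaviour of silently
# truncating to the shortest input. Passing strict=False explicitly documents
# that the truncating behaviour is intended. A small demonstration:
names = ["keys_floats", "keys_int64s", "keys_strings"]
values = [[1.0], [2]]  # deliberately shorter than `names`

# Pre-3.10 behaviour, now spelled out: pairs only up to the shorter iterable.
print(list(zip(names, values, strict=False)))  # [('keys_floats', [1.0]), ('keys_int64s', [2])]

try:
    list(zip(names, values, strict=True))  # strict mode flags the length mismatch
except ValueError as exc:
    print(f"strict=True raised: {exc}")
# --- end sketch --------------------------------------------------------------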
@@ -32,7 +31,7 @@ def save_checkpoint( _save_checkpoint(trainable_params, non_trainable_params, os.fspath(path_to_checkpoint), nominal_checkpoint) -def load_checkpoint_to_model(path_to_checkpoint: Union[str, os.PathLike], model: onnx.ModelProto) -> None: +def load_checkpoint_to_model(path_to_checkpoint: str | os.PathLike, model: onnx.ModelProto) -> None: """Loads the checkpoint to an onnx inference model. Args: diff --git a/orttraining/orttraining/python/training/onnxblock/loss/loss.py b/orttraining/orttraining/python/training/onnxblock/loss/loss.py index 09429dd844187..e0624c6722519 100644 --- a/orttraining/orttraining/python/training/onnxblock/loss/loss.py +++ b/orttraining/orttraining/python/training/onnxblock/loss/loss.py @@ -2,7 +2,6 @@ # Licensed under the MIT License. import copy -from typing import Optional import onnx @@ -62,7 +61,7 @@ class CrossEntropyLoss(blocks.Block): contribute to the input gradient. """ - def __init__(self, weight=None, reduction: str = "mean", ignore_index: Optional[int] = None): + def __init__(self, weight=None, reduction: str = "mean", ignore_index: int | None = None): super().__init__() if reduction not in ["mean", "sum", "none"]: @@ -230,7 +229,7 @@ def __init__(self, reduction: str = "mean"): self._abs = blocks.Abs() self._sub = blocks.Sub() - def build(self, loss_input_name: str, target_name: Optional[str] = "target"): + def build(self, loss_input_name: str, target_name: str | None = "target"): """Adds an L1 loss subgraph on top of the base_model. Args: diff --git a/orttraining/orttraining/python/training/onnxblock/onnxblock.py b/orttraining/orttraining/python/training/onnxblock/onnxblock.py index 64f7acf4dc02c..0cb42cce9e5d5 100644 --- a/orttraining/orttraining/python/training/onnxblock/onnxblock.py +++ b/orttraining/orttraining/python/training/onnxblock/onnxblock.py @@ -3,7 +3,6 @@ import logging from abc import abstractmethod -from typing import List, Tuple import onnx @@ -139,7 +138,7 @@ def requires_grad(self, argument_name: str, value: bool = True): self._requires_grad.remove(argument_name) self._frozen_params.add(argument_name) - def parameters(self) -> Tuple[List[onnx.TensorProto], List[onnx.TensorProto]]: + def parameters(self) -> tuple[list[onnx.TensorProto], list[onnx.TensorProto]]: """Trainable as well as non-trainable (frozen) parameters of the model. Model parameters that are extracted while building the training model @@ -161,7 +160,7 @@ def parameters(self) -> Tuple[List[onnx.TensorProto], List[onnx.TensorProto]]: return self._parameters - def to_model_proto(self) -> Tuple[onnx.ModelProto, onnx.ModelProto]: + def to_model_proto(self) -> tuple[onnx.ModelProto, onnx.ModelProto]: """Returns the training and eval models. Once the gradient graph is built, the training and eval models can be retrieved diff --git a/orttraining/orttraining/python/training/onnxblock/optim/optim.py b/orttraining/orttraining/python/training/onnxblock/optim/optim.py index d14b2efefe916..a18fe7e6414e2 100644 --- a/orttraining/orttraining/python/training/onnxblock/optim/optim.py +++ b/orttraining/orttraining/python/training/onnxblock/optim/optim.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
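# --- Illustrative sketch (not part of this diff) ----------------------------
# "path_to_checkpoint: str | os.PathLike" above accepts both plain strings and
# pathlib.Path objects. A hypothetical helper showing how such a parameter is
# typically normalised with os.fspath before being handed to code that expects
# a plain string (the helper itself is invented for illustration):
import os
from pathlib import Path


def normalize_checkpoint_path(path_to_checkpoint: str | os.PathLike) -> str:
    # os.fspath() returns the str as-is for str inputs and Path.__fspath__()
    # for pathlib.Path inputs; anything else raises TypeError.
    return os.fspath(path_to_checkpoint)


print(normalize_checkpoint_path("artifacts/checkpoint"))
print(normalize_checkpoint_path(Path("artifacts") / "checkpoint"))
# --- end sketch --------------------------------------------------------------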
-from typing import Dict, List, Optional, Tuple import onnx @@ -66,10 +65,10 @@ def __init__(self): def _build_optimizer_node( self, - input_names: List[str], + input_names: list[str], output_name: str, node_name: str, - node_attributes: Dict, + node_attributes: dict, ) -> str: """ Build and append an optimizer node to the ONNX graph. @@ -135,10 +134,10 @@ def build( class AdamWOptimizer(_OptimizerBase): def __init__( self, - bias_correction: Optional[bool] = True, - betas: Tuple[float, float] = (0.9, 0.999), - eps: Optional[float] = 1e-6, - weight_decay: Optional[float] = 0.0, + bias_correction: bool | None = True, + betas: tuple[float, float] = (0.9, 0.999), + eps: float | None = 1e-6, + weight_decay: float | None = 0.0, ): super().__init__() @@ -242,7 +241,7 @@ def _optimizer_specific_logic( learning_rate_name: str, params_name: str, gradients_name: str, - trainable_parameters: Tuple[List[onnx.TensorProto], List[onnx.TensorProto]], + trainable_parameters: tuple[list[onnx.TensorProto], list[onnx.TensorProto]], ) -> str: raise NotImplementedError("Subclasses must implement _optimizer_specific_logic method.") @@ -264,7 +263,7 @@ def _optimizer_specific_logic( learning_rate_name: str, params_name: str, gradients_name: str, - trainable_parameters: Tuple[List[onnx.TensorProto], List[onnx.TensorProto]], + trainable_parameters: tuple[list[onnx.TensorProto], list[onnx.TensorProto]], ) -> str: onnx_model = self.base step_name = "step" @@ -307,7 +306,7 @@ def _optimizer_specific_logic( learning_rate_name: str, params_name: str, gradients_name: str, - trainable_parameters: Tuple[List[onnx.TensorProto], List[onnx.TensorProto]], + trainable_parameters: tuple[list[onnx.TensorProto], list[onnx.TensorProto]], ) -> str: onnx_model = self.base updated_flag_name = self._sgd(learning_rate_name, params_name, gradients_name) diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py index 55e2e08432137..9d8f178c1c65c 100644 --- a/orttraining/orttraining/python/training/optim/_ds_modifier.py +++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py @@ -178,7 +178,7 @@ def is_model_parallel_parameter(p): #### THIS IS THE FASTER IMPLEMENTATION #### grads_for_norm = [] - for g, p in zip(gradients, params): + for g, p in zip(gradients, params, strict=False): if is_model_parallel_parameter(p) or (target.model_parallel_rank == 0): # BE NOTED: deepspeed original give a double type conversion here, not sure whether this is impacting some models. 
# https://github.com/microsoft/DeepSpeed/blob/9e5c0c5c3ecabb68b7e9dffac0e9b8d167e3cab8/deepspeed/runtime/zero/stage2.py#L1501 diff --git a/orttraining/orttraining/python/training/ort_triton/_cache.py b/orttraining/orttraining/python/training/ort_triton/_cache.py index b70064377abfc..294c844bb5ac5 100644 --- a/orttraining/orttraining/python/training/ort_triton/_cache.py +++ b/orttraining/orttraining/python/training/ort_triton/_cache.py @@ -12,7 +12,6 @@ import sys import tempfile from types import ModuleType -from typing import Tuple @functools.lru_cache(None) @@ -73,7 +72,7 @@ class ModuleCache: clear = staticmethod(cache.clear) @classmethod - def load(cls, key_func, mod_func, *args) -> Tuple[str, ModuleType]: + def load(cls, key_func, mod_func, *args) -> tuple[str, ModuleType]: key = key_func(*args) if key not in cls.cache: func_name, mod = mod_func(*args) diff --git a/orttraining/orttraining/python/training/ort_triton/_codegen.py b/orttraining/orttraining/python/training/ort_triton/_codegen.py index c6759630b2777..548b415ea990e 100644 --- a/orttraining/orttraining/python/training/ort_triton/_codegen.py +++ b/orttraining/orttraining/python/training/ort_triton/_codegen.py @@ -12,8 +12,6 @@ """ -from typing import Tuple - import sympy import torch from sympy.codegen.rewriting import create_expand_pow_optimization @@ -49,7 +47,7 @@ def codegen(self, node: IRNode, context: CodegenContext, code_buffer: CodeBuffer assert func is not None, f"unimplemented node: {node.__class__.__name__}" func(node, context, code_buffer, indent) - def _get_elementwise_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) -> Tuple[str, str]: + def _get_elementwise_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) -> tuple[str, str]: if offset_calc.is_x_reduced(arg_name): # Scalar. 
return "tl.full([1], 0, tl.int32)", "" @@ -61,7 +59,7 @@ def _get_elementwise_offset_mask(self, offset_calc: OffsetCalculator, arg_name: offset_str = str(expand_opt(sympy_dot(parse_shape(idx_var), strides))) return offset_str, "xmask" if offset_calc.requires_x_mask else "" - def _get_reduce_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) -> Tuple[str, str]: + def _get_reduce_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) -> tuple[str, str]: offset_strs = [] mask_strs = [] if not offset_calc.is_x_reduced(arg_name): @@ -93,7 +91,7 @@ def _get_reduce_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) offset_strs.append("tl.full([1, 1], 0, tl.int32)") return " + ".join(offset_strs), " & ".join(mask_strs) - def _get_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) -> Tuple[str, str]: + def _get_offset_mask(self, offset_calc: OffsetCalculator, arg_name: str) -> tuple[str, str]: return ( self._get_reduce_offset_mask(offset_calc, arg_name) if offset_calc.is_reduction diff --git a/orttraining/orttraining/python/training/ort_triton/_common.py b/orttraining/orttraining/python/training/ort_triton/_common.py index a1c3d7d7e1d4f..420c02f4c4385 100644 --- a/orttraining/orttraining/python/training/ort_triton/_common.py +++ b/orttraining/orttraining/python/training/ort_triton/_common.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- from abc import abstractmethod -from typing import Any, Dict, List, Tuple +from typing import Any import sympy from onnx import GraphProto, NodeProto, TensorProto @@ -12,7 +12,7 @@ from ._sympy_utils import extract_shape_from_symbol from ._utils import get_attribute, get_reduce_info, next_power_of_2 -_SPECIAL_FLOATS: List[str] = ["inf", "-inf"] +_SPECIAL_FLOATS: list[str] = ["inf", "-inf"] class CodegenContext: @@ -20,8 +20,8 @@ class CodegenContext: record variable name mapping in term of IRnodes. """ - def __init__(self, var_map: Dict[str, str]): - self._var_map: Dict[str, str] = {**var_map} + def __init__(self, var_map: dict[str, str]): + self._var_map: dict[str, str] = {**var_map} # Get variable name by the node arg name in ONNX graph. def get_variable_name(self, name: str) -> str: @@ -36,7 +36,7 @@ def get_internal_variable_name(self, name: str) -> str: class CodeBuffer: def __init__(self): - self.buffer: List[str] = [] + self.buffer: list[str] = [] def __iadd__(self, other: str): self.buffer.append(other) @@ -59,7 +59,7 @@ class SymbolicDSU: """ def __init__(self): - self._dsu: Dict[sympy.Expr, sympy.Expr] = {} + self._dsu: dict[sympy.Expr, sympy.Expr] = {} def find(self, symbolic: sympy.Expr) -> sympy.Expr: if symbolic not in self._dsu: @@ -81,25 +81,25 @@ class TensorInfo: Represent a input/output tensor of a node. 
""" - def __init__(self, dtype: TensorProto.DataType, shape: List[sympy.Expr]): + def __init__(self, dtype: TensorProto.DataType, shape: list[sympy.Expr]): self._dtype: TensorProto.DataType = dtype - self._shape: List[sympy.Expr] = shape + self._shape: list[sympy.Expr] = shape @property def dtype(self) -> TensorProto.DataType: return self._dtype @property - def shape(self) -> List[sympy.Expr]: + def shape(self) -> list[sympy.Expr]: return self._shape def update_shape(self, symbolics: SymbolicDSU): self._shape = [symbolics.find(dim) if dim.is_symbol else dim for dim in self._shape] -def _infer_elementwise_shape(input_infos: List[TensorInfo], symbolics: SymbolicDSU) -> List[sympy.Expr]: +def _infer_elementwise_shape(input_infos: list[TensorInfo], symbolics: SymbolicDSU) -> list[sympy.Expr]: max_len = max([len(input_info.shape) for input_info in input_infos]) - output_shape: List[sympy.Expr] = [sympy.Integer(1)] * max_len + output_shape: list[sympy.Expr] = [sympy.Integer(1)] * max_len for input_info in input_infos: offset = max_len - len(input_info.shape) for idx, dim in enumerate(input_info.shape): @@ -112,22 +112,22 @@ def _infer_elementwise_shape(input_infos: List[TensorInfo], symbolics: SymbolicD def _infer_elementwise( - node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU -) -> List[TensorInfo]: + node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU +) -> list[TensorInfo]: # pylint: disable=unused-argument return [TensorInfo(input_infos[0].dtype, _infer_elementwise_shape(input_infos, symbolics))] def _infer_where( - node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU -) -> List[TensorInfo]: + node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU +) -> list[TensorInfo]: # pylint: disable=unused-argument return [TensorInfo(input_infos[1].dtype, _infer_elementwise_shape(input_infos, symbolics))] def _infer_reduction( - node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU -) -> List[TensorInfo]: + node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU +) -> list[TensorInfo]: # pylint: disable=unused-argument input_rank = len(input_infos[0].shape) keep_dims, axes = get_reduce_info(node, graph, input_rank) @@ -141,15 +141,15 @@ def _infer_reduction( def _infer_unary( - node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU -) -> List[TensorInfo]: + node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU +) -> list[TensorInfo]: # pylint: disable=unused-argument return [input_infos[0]] def _infer_cast( - node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU -) -> List[TensorInfo]: + node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU +) -> list[TensorInfo]: # pylint: disable=unused-argument dtype = get_attribute(node, "to", TensorProto.UNDEFINED) assert dtype != TensorProto.UNDEFINED @@ -157,8 +157,8 @@ def _infer_cast( def _infer_dropout( - node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU -) -> List[TensorInfo]: + node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU +) -> list[TensorInfo]: # pylint: disable=unused-argument return [input_infos[0], TensorInfo(TensorProto.BOOL, input_infos[0].shape)] @@ -190,8 +190,8 @@ class TypeAndShapeInfer: @classmethod def 
infer( - cls, node: NodeProto, input_infos: List[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU - ) -> List[TensorInfo]: + cls, node: NodeProto, input_infos: list[TensorInfo], graph: GraphProto, symbolics: SymbolicDSU + ) -> list[TensorInfo]: if node.op_type not in cls._INFER_FUNC_MAP: raise NotImplementedError(f"Unsupported op type: {node.op_type}") return cls._INFER_FUNC_MAP[node.op_type](node, input_infos, graph, symbolics) @@ -224,7 +224,7 @@ def __init__(self, x_numel: sympy.Expr, r_numel: sympy.Expr, contiguous: bool): ) ) ) - self.configs: List[Tuple[int, int, int]] = self._gen_autotune_configs(x_numel_int, r_numel_int, contiguous) + self.configs: list[tuple[int, int, int]] = self._gen_autotune_configs(x_numel_int, r_numel_int, contiguous) # If there is symbolic shape, we will not tune the kernel. if not x_numel.is_number or not r_numel.is_number: self.configs = self.configs[-1:] @@ -233,13 +233,13 @@ def __init__(self, x_numel: sympy.Expr, r_numel: sympy.Expr, contiguous: bool): def _num_warps(self, x: int, r: int) -> int: return min(max(x * r // 256, 2), 8) - def _gen_config(self, xnp2: int, rnp2: int, x: int, r: int) -> Tuple[int, int, int]: + def _gen_config(self, xnp2: int, rnp2: int, x: int, r: int) -> tuple[int, int, int]: x = min(x, xnp2) r = min(r, rnp2) return x, r, self._num_warps(x, r) # TODO: we need to tune more kernels to get more reasonable configs for better performance. - def _gen_autotune_configs(self, x_numel: int, r_numel: int, contiguous: bool) -> List[Tuple[int, int, int]]: + def _gen_autotune_configs(self, x_numel: int, r_numel: int, contiguous: bool) -> list[tuple[int, int, int]]: configs = [] xnp2 = next_power_of_2(x_numel) if r_numel == 1: diff --git a/orttraining/orttraining/python/training/ort_triton/_decompose.py b/orttraining/orttraining/python/training/ort_triton/_decompose.py index c1ded3975d3a6..601ab03847e72 100644 --- a/orttraining/orttraining/python/training/ort_triton/_decompose.py +++ b/orttraining/orttraining/python/training/ort_triton/_decompose.py @@ -8,8 +8,6 @@ "simple ops" can be executed in one pass """ -from typing import List - import sympy from onnx import GraphProto, NodeProto, TensorProto, helper @@ -30,7 +28,7 @@ class DecomposeDispatch: def __init__(self): self.count = 0 - def __call__(self, node: NodeProto, graph: GraphProto, **kwargs) -> List[NodeProto]: + def __call__(self, node: NodeProto, graph: GraphProto, **kwargs) -> list[NodeProto]: op_type = node.op_type if not hasattr(self, op_type): raise NotImplementedError(f"Not implemented for op type: {op_type}") diff --git a/orttraining/orttraining/python/training/ort_triton/_ir.py b/orttraining/orttraining/python/training/ort_triton/_ir.py index 23abb082c2217..f43e424493b2c 100644 --- a/orttraining/orttraining/python/training/ort_triton/_ir.py +++ b/orttraining/orttraining/python/training/ort_triton/_ir.py @@ -5,7 +5,7 @@ from abc import abstractmethod from collections import defaultdict -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any import sympy import torch @@ -22,16 +22,16 @@ class TensorArg: If it's constant (initializer or constant node), it also contains the data in numpy array. 
""" - def __init__(self, name: str, tensor_info: Optional[TensorInfo] = None, data: Optional[torch.Tensor] = None): + def __init__(self, name: str, tensor_info: TensorInfo | None = None, data: torch.Tensor | None = None): self._name: str = name - self._data: Optional[torch.Tensor] = data + self._data: torch.Tensor | None = data if data is not None: self._dtype: torch.dtype = data.dtype - self._shape: List[sympy.Expr] = parse_shape(list(data.shape)) + self._shape: list[sympy.Expr] = parse_shape(list(data.shape)) else: assert tensor_info is not None self._dtype: torch.dtype = to_torch_dtype(tensor_info.dtype) - self._shape: List[sympy.Expr] = tensor_info.shape + self._shape: list[sympy.Expr] = tensor_info.shape self.cross_kernels: bool = False @property @@ -43,11 +43,11 @@ def dtype(self) -> torch.dtype: return self._dtype @property - def shape(self) -> List[sympy.Expr]: + def shape(self) -> list[sympy.Expr]: return self._shape @property - def data(self) -> Optional[torch.Tensor]: + def data(self) -> torch.Tensor | None: return self._data @@ -61,18 +61,18 @@ class OffsetCalculator: If a reduce node has non-contiguous axes, need to decompose it into multiple reduce nodes before code-gen. """ - def __init__(self, target_shape: List[sympy.Expr], reduce_axes: List[int]): - self.target_shape: List[sympy.Expr] = target_shape + def __init__(self, target_shape: list[sympy.Expr], reduce_axes: list[int]): + self.target_shape: list[sympy.Expr] = target_shape self.is_reduction: bool = len(reduce_axes) > 0 self.rank = len(target_shape) self.reduce_axes = sort_reduce_axes(reduce_axes, self.rank) - self.x_dims: List[sympy.Expr] = [target_shape[dim] for dim in range(self.rank) if dim not in self.reduce_axes] + self.x_dims: list[sympy.Expr] = [target_shape[dim] for dim in range(self.rank) if dim not in self.reduce_axes] self.x_rank: int = len(self.x_dims) self.x_numel: sympy.Expr = sympy.prod(self.x_dims) if self.x_rank > 0 else sympy.Integer(1) - self.r_dims: List[sympy.Expr] = [target_shape[dim] for dim in self.reduce_axes] + self.r_dims: list[sympy.Expr] = [target_shape[dim] for dim in self.reduce_axes] self.r_rank: int = len(self.r_dims) self.r_numel: sympy.Expr = sympy.prod(self.r_dims) if self.r_rank > 0 else sympy.Integer(1) - self.x_strides: List[sympy.Expr] = [] + self.x_strides: list[sympy.Expr] = [] if self.x_rank > 0: self.x_strides.append(sympy.Integer(1)) for i in range(self.x_rank - 2, -1, -1): @@ -80,14 +80,14 @@ def __init__(self, target_shape: List[sympy.Expr], reduce_axes: List[int]): # To avoid generating useless code for offset calculation, we use x_compute_dims and r_compute_dims to # track the dimensions that need to be computed in the offset calculation. These 2 sets will be set in # register_tensor_arg function below. 
- self.x_compute_dims: Set[int] = set() - self.r_strides: List[sympy.Expr] = [] + self.x_compute_dims: set[int] = set() + self.r_strides: list[sympy.Expr] = [] if self.r_rank > 0: self.r_strides.append(sympy.Integer(1)) for i in range(self.r_rank - 2, -1, -1): self.r_strides.insert(0, self.r_strides[0] * self.r_dims[i + 1]) - self.r_compute_dims: Set[int] = set() - self.input_strides: Dict[str, List[sympy.Expr]] = dict() + self.r_compute_dims: set[int] = set() + self.input_strides: dict[str, list[sympy.Expr]] = dict() self.autotune_configs: AutotuneConfigs = AutotuneConfigs( self.x_numel, self.r_numel, not self.is_reduction or self.reduce_axes[-1] == self.rank - 1 ) @@ -99,17 +99,17 @@ def __init__(self, target_shape: List[sympy.Expr], reduce_axes: List[int]): self.requires_r_mask: bool = any( simplified_r_numel % sympy.Integer(config[1]) != 0 for config in self.autotune_configs.configs ) - self.reduced_args: Set[str] = set() - self.symbolic_shape_variables: Set[str] = set() + self.reduced_args: set[str] = set() + self.symbolic_shape_variables: set[str] = set() - def get_input_strides(self, name: str) -> List[sympy.Expr]: + def get_input_strides(self, name: str) -> list[sympy.Expr]: assert name in self.input_strides return self.input_strides[name] - def get_x_input_strides(self, name: str) -> List[sympy.Expr]: + def get_x_input_strides(self, name: str) -> list[sympy.Expr]: return [dim for idx, dim in enumerate(self.get_input_strides(name)) if idx not in self.reduce_axes] - def get_r_input_strides(self, name: str) -> List[sympy.Expr]: + def get_r_input_strides(self, name: str) -> list[sympy.Expr]: return [dim for idx, dim in enumerate(self.get_input_strides(name)) if idx in self.reduce_axes] # Whether the x shape of the tensor argument is contiguous and is same as the target shape. @@ -195,9 +195,9 @@ class IRNode: The base class for all IR nodes. 
""" - def __init__(self, inputs: List[TensorArg], outputs: List[TensorArg]): - self.inputs: List[TensorArg] = inputs - self.outputs: List[TensorArg] = outputs + def __init__(self, inputs: list[TensorArg], outputs: list[TensorArg]): + self.inputs: list[TensorArg] = inputs + self.outputs: list[TensorArg] = outputs @abstractmethod def codegen(self, visitor: NodeVisitor, context: CodegenContext, code_buffer: CodeBuffer, indent: int = 0): @@ -212,13 +212,13 @@ class ComputeNode(IRNode): def __init__( self, op_type: str, - inputs: List[TensorArg], - outputs: List[TensorArg], - attributes: Dict[str, Any] = {}, # noqa: B006 + inputs: list[TensorArg], + outputs: list[TensorArg], + attributes: dict[str, Any] = {}, # noqa: B006 ): super().__init__(inputs, outputs) self._op_type: str = op_type - self._attributes: Dict[str, Any] = attributes + self._attributes: dict[str, Any] = attributes @property def op_type(self): @@ -230,7 +230,7 @@ def attributes(self): class ReduceNode(ComputeNode): - def __init__(self, op_type: str, inputs: List[TensorArg], outputs: List[TensorArg], offset_calc: OffsetCalculator): + def __init__(self, op_type: str, inputs: list[TensorArg], outputs: list[TensorArg], offset_calc: OffsetCalculator): super().__init__(op_type, inputs, outputs) assert op_type == "ReduceSum" or op_type == "ReduceMax" or op_type == "ReduceMin" self.default_value: str = ( @@ -250,9 +250,9 @@ class ReduceForLoopStart(ComputeNode): shared-memory declaration """ - def __init__(self, reduce_nodes: List[ReduceNode], offset_calc: OffsetCalculator): + def __init__(self, reduce_nodes: list[ReduceNode], offset_calc: OffsetCalculator): super().__init__("", [], []) - self.reduce_nodes: List[ReduceNode] = reduce_nodes + self.reduce_nodes: list[ReduceNode] = reduce_nodes self.offset_calc: OffsetCalculator = offset_calc @@ -261,9 +261,9 @@ class ReduceForLoopEnd(ComputeNode): shared-memory reduction """ - def __init__(self, reduce_nodes: List[ReduceNode], offset_calc: OffsetCalculator): + def __init__(self, reduce_nodes: list[ReduceNode], offset_calc: OffsetCalculator): super().__init__("", [], []) - self.reduce_nodes: List[ReduceNode] = reduce_nodes + self.reduce_nodes: list[ReduceNode] = reduce_nodes self.offset_calc: OffsetCalculator = offset_calc @@ -273,7 +273,7 @@ class DropoutNode(ComputeNode): if there are more than one dropout operators in the subgraph. 
""" - def __init__(self, inputs: List[TensorArg], outputs: List[TensorArg], offset_calc: OffsetCalculator): + def __init__(self, inputs: list[TensorArg], outputs: list[TensorArg], offset_calc: OffsetCalculator): super().__init__("Dropout", inputs, outputs) self.offset_calc: OffsetCalculator = offset_calc self.offset_calc.register_tensor_arg(inputs[0]) @@ -301,14 +301,14 @@ class KernelNode(IRNode): """ - def __init__(self, inputs: List[TensorArg], outputs: List[TensorArg], target_shape: List, reduce_axes: List[int]): + def __init__(self, inputs: list[TensorArg], outputs: list[TensorArg], target_shape: list, reduce_axes: list[int]): super().__init__(inputs, outputs) self.name: str = gen_unique_name("triton") - self.internal_args: Set[str] = set() - self.constants: Dict[str, TensorArg] = dict() - self.target_shape: List[sympy.Expr] = target_shape - self.sub_nodes: List[IRNode] = [] - self.var_map: Dict[str, str] = dict() + self.internal_args: set[str] = set() + self.constants: dict[str, TensorArg] = dict() + self.target_shape: list[sympy.Expr] = target_shape + self.sub_nodes: list[IRNode] = [] + self.var_map: dict[str, str] = dict() self.has_dropout: bool = False self.offset_calc: OffsetCalculator = OffsetCalculator(target_shape, reduce_axes) @@ -335,18 +335,18 @@ def gen_variable_names(self): class ElementwiseKernelNode(KernelNode): - def __init__(self, inputs: List[TensorArg], outputs: List[TensorArg], target_shape: List[sympy.Expr]): + def __init__(self, inputs: list[TensorArg], outputs: list[TensorArg], target_shape: list[sympy.Expr]): super().__init__(inputs, outputs, target_shape, []) class ReduceKernelNode(KernelNode): def __init__( self, - inputs: List[TensorArg], - outputs: List[TensorArg], - target_shape: List[sympy.Expr], - reduce_axes: List[int], - reduced_args: Set[str], + inputs: list[TensorArg], + outputs: list[TensorArg], + target_shape: list[sympy.Expr], + reduce_axes: list[int], + reduced_args: set[str], ): super().__init__(inputs, outputs, target_shape, reduce_axes) self.offset_calc.reduced_args.update(reduced_args) @@ -361,18 +361,18 @@ class ModuleNode(IRNode): def __init__( self, func_name: str, - inputs: List[TensorArg], - outputs: List[TensorArg], - constants: List[TensorArg], - cross_kernel_args: List[Tuple[TensorArg, int]], - kernels: List[KernelNode], + inputs: list[TensorArg], + outputs: list[TensorArg], + constants: list[TensorArg], + cross_kernel_args: list[tuple[TensorArg, int]], + kernels: list[KernelNode], ): super().__init__(inputs, outputs) self.func_name: str = func_name # Currently need inputs and outputs only. May need intermediate vars and constants later. 
- self.constants: List[TensorArg] = constants - self.kernels: List[KernelNode] = kernels - self.var_map: Dict[str, str] = dict() + self.constants: list[TensorArg] = constants + self.kernels: list[KernelNode] = kernels + self.var_map: dict[str, str] = dict() existing_names = set() for input in self.inputs: name = gen_variable_name(input.name, "in", existing_names) @@ -380,7 +380,7 @@ def __init__( for output in self.outputs: name = gen_variable_name(output.name, "out", existing_names) self.var_map[output.name] = name - self.cross_kernel_args_to_delete: Dict[int, Set[str]] = defaultdict(set) + self.cross_kernel_args_to_delete: dict[int, set[str]] = defaultdict(set) for pair in cross_kernel_args: name = gen_variable_name(pair[0].name, "buf", existing_names) self.cross_kernel_args_to_delete[pair[1]].add(name) diff --git a/orttraining/orttraining/python/training/ort_triton/_lowering.py b/orttraining/orttraining/python/training/ort_triton/_lowering.py index 7253c7935a650..642f2a02ede6f 100644 --- a/orttraining/orttraining/python/training/ort_triton/_lowering.py +++ b/orttraining/orttraining/python/training/ort_triton/_lowering.py @@ -6,7 +6,7 @@ import itertools import warnings from collections import defaultdict -from typing import Any, Dict, List, Set, Tuple +from typing import Any import sympy from onnx import NodeProto, helper @@ -37,31 +37,31 @@ class NodeGroup: """ - def __init__(self, node: NodeProto, reduce_axes: List[int], keep_dims: int, node_arg_infos: Dict[str, TensorInfo]): + def __init__(self, node: NodeProto, reduce_axes: list[int], keep_dims: int, node_arg_infos: dict[str, TensorInfo]): self._node_arg_infos = node_arg_infos - self.nodes_groups: List[Any] = [node] - self.target_shape: List[sympy.Expr] = self._get_target_shape(node) + self.nodes_groups: list[Any] = [node] + self.target_shape: list[sympy.Expr] = self._get_target_shape(node) rank = len(self.target_shape) - self.reduce_axes: List[int] = sort_reduce_axes(reduce_axes, rank) + self.reduce_axes: list[int] = sort_reduce_axes(reduce_axes, rank) x_dims = [self.target_shape[dim] for dim in range(rank) if dim not in self.reduce_axes] # x_numel is meant to hint how many rows of tensor will be processed by each kernel. # x is same as CUDA block in X direction. x_numel: sympy.Expr = sympy.prod(x_dims) if len(x_dims) > 0 else sympy.Integer(1) - r_dims: List[sympy.Expr] = [self.target_shape[dim] for dim in self.reduce_axes] + r_dims: list[sympy.Expr] = [self.target_shape[dim] for dim in self.reduce_axes] # r_numel is meant to hint how many elements in a row of tensor will be processed by each kernel. # r is a abbreviation of reduction, so, it's only used for reduction nodes. r_numel: sympy.Expr = sympy.prod(r_dims) if len(r_dims) > 0 else sympy.Integer(1) self.autotune_configs: AutotuneConfigs = AutotuneConfigs( x_numel, r_numel, len(self.reduce_axes) == 0 or self.reduce_axes[-1] == rank - 1 ) - self.reduced_args: Set[str] = set() + self.reduced_args: set[str] = set() if keep_dims != 1: self.reduced_args.add(node.output[0]) # Check if shape can be broadcasted to target_shape. # For example, [1, 3, 1, 1] can be broadcasted to [1, 3, 5, 7]. # and we support `keepdims = false``, so [1, 3, 5, 7] is compatible with [1, 3, 5]. 
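# --- Illustrative sketch (not part of this diff) ----------------------------
# The compatibility check above asks whether a node's shape can be broadcast to
# the group's target shape (e.g. [1, 3, 1, 1] broadcasts to [1, 3, 5, 7]). A
# generic, right-aligned broadcast check in plain Python; it covers only the
# ordinary broadcasting rule, not the extra keepdims=False case handled above:
def broadcasts_to(shape: list[int], target: list[int]) -> bool:
    if len(shape) > len(target):
        return False
    # Compare dimensions from the trailing end; a dim matches if equal or 1.
    for dim, target_dim in zip(reversed(shape), reversed(target), strict=False):
        if dim != 1 and dim != target_dim:
            return False
    return True


print(broadcasts_to([1, 3, 1, 1], [1, 3, 5, 7]))  # True
print(broadcasts_to([2, 3], [1, 3, 5, 7]))        # False
# --- end sketch --------------------------------------------------------------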
- def _compatible_shape(self, shape: List[sympy.Expr], split_if_different: bool) -> bool: + def _compatible_shape(self, shape: list[sympy.Expr], split_if_different: bool) -> bool: if split_if_different: return shape == self.target_shape if len(shape) > len(self.target_shape): @@ -88,7 +88,7 @@ def _get_target_shape(self, node): # 2. The target shape of a group is determined by the first node in the group. # we call it dominators, and it determinate the partition strategy of X_numel/R_numel. # A group can't have multiple dominators. - def compatible(self, node: NodeProto, reduce_axes: List[int], keep_dims: int, split_if_different: bool) -> bool: + def compatible(self, node: NodeProto, reduce_axes: list[int], keep_dims: int, split_if_different: bool) -> bool: target_shape = self._get_target_shape(node) if is_reduction_node(node): # If the following nodes are all elementwise nodes on reduce output shape. @@ -105,7 +105,7 @@ def compatible(self, node: NodeProto, reduce_axes: List[int], keep_dims: int, sp # 1. Create a new group with the reduction node. # 2. Add this node to the current group. - def add_node(self, node: NodeProto, reduce_axes: List[int], keep_dims: int): + def add_node(self, node: NodeProto, reduce_axes: list[int], keep_dims: int): if is_reduction_node(node): group = NodeGroup(node, reduce_axes, keep_dims, self._node_arg_infos) self.nodes_groups.append(group) @@ -142,7 +142,7 @@ def dependent_nodes(self, keep_reduce_node: bool): return node_map, reduce_nodes # finalize the group, and return the flatten nodes - def flatten(self, sorted_nodes: List[NodeProto]) -> Tuple[List[NodeProto], List[List[int]]]: + def flatten(self, sorted_nodes: list[NodeProto]) -> tuple[list[NodeProto], list[list[int]]]: if self.autotune_configs.requires_for_loop: layers = [] group_layer = [self] @@ -193,12 +193,12 @@ class KernelIO: """ def __init__(self): - self.module_inputs: List[str] = [] - self.cross_kernel_inputs: List[str] = [] - self.constants: List[str] = [] - self.module_outputs: List[str] = [] - self.cross_kernel_outputs: List[str] = [] - self.internal_args: List[str] = [] + self.module_inputs: list[str] = [] + self.cross_kernel_inputs: list[str] = [] + self.constants: list[str] = [] + self.module_outputs: list[str] = [] + self.cross_kernel_outputs: list[str] = [] + self.internal_args: list[str] = [] class GraphLowering: @@ -217,24 +217,24 @@ class GraphLowering: def __init__(self, sorted_graph: SortedGraph): self._sorted_graph: SortedGraph = sorted_graph - self._node_arg_infos: Dict[str, TensorInfo] = sorted_graph.node_arg_infos - self._module_inputs: List[TensorArg] = [] - self._module_outputs: List[TensorArg] = [] - self._module_constants: List[TensorArg] = [] - self._module_input_names: Set[str] = set() - self._module_output_names: Set[str] = set() - self._module_constant_names: Set[str] = set() - self._tensor_args: Dict[str, TensorArg] = {} + self._node_arg_infos: dict[str, TensorInfo] = sorted_graph.node_arg_infos + self._module_inputs: list[TensorArg] = [] + self._module_outputs: list[TensorArg] = [] + self._module_constants: list[TensorArg] = [] + self._module_input_names: set[str] = set() + self._module_output_names: set[str] = set() + self._module_constant_names: set[str] = set() + self._tensor_args: dict[str, TensorArg] = {} # Extract module inputs, outputs and constants. self._extract_module_io() # Group nodes into NodeGroups, each NodeGroup represents a kernel. 
- self._groups: List[NodeGroup] = [] + self._groups: list[NodeGroup] = [] self._group_nodes() # Convert NodeGroups to KernelNodes. - self._kernel_nodes: List[KernelNode] = [] - self._kernel_io_list: List[KernelIO] = [] + self._kernel_nodes: list[KernelNode] = [] + self._kernel_io_list: list[KernelIO] = [] self._lower() # A module is map to a real onnx graph. @@ -256,12 +256,12 @@ def _extract_module_io(self): for arg in itertools.chain(self._module_inputs, self._module_outputs, self._module_constants) ) - def _get_reduce_info(self, node) -> Tuple[int, List[int]]: + def _get_reduce_info(self, node) -> tuple[int, list[int]]: assert is_reduction_node(node) input_rank = len(self._node_arg_infos[node.input[0]].shape) return get_reduce_info(node, self._sorted_graph.original_graph, input_rank) - def _process_node(self, node: NodeProto, precessors: Dict[str, List[NodeProto]], group: NodeGroup): + def _process_node(self, node: NodeProto, precessors: dict[str, list[NodeProto]], group: NodeGroup): dependent_nodes = set() dependent_nodes.add(node.name) for precessor in precessors[node.name]: @@ -328,7 +328,7 @@ def _group_nodes(self): self._groups.append(group_i) flag.add(i) - def _get_node_io(self, node: NodeProto) -> Tuple[List[TensorArg], List[TensorArg]]: + def _get_node_io(self, node: NodeProto) -> tuple[list[TensorArg], list[TensorArg]]: input_args = [] for input in node.input: if input in self._tensor_args: @@ -345,7 +345,7 @@ def _get_node_io(self, node: NodeProto) -> Tuple[List[TensorArg], List[TensorArg self._tensor_args[output] = output_args[-1] return input_args, output_args - def _extract_kernel_io(self, nodes: List[NodeProto]) -> KernelIO: + def _extract_kernel_io(self, nodes: list[NodeProto]) -> KernelIO: kernel_io = KernelIO() input_set = set() output_set = set() diff --git a/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py b/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py index d67a1c1665200..722f05dfdf493 100644 --- a/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py +++ b/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py @@ -5,7 +5,6 @@ import copy import itertools -from typing import Dict, List, Set import onnx import sympy @@ -30,14 +29,14 @@ class SortedGraph: input_shapes: the shapes of the model inputs. Can be numeric values or symbolic values. """ - def __init__(self, model: ModelProto, input_shapes: List[List[sympy.Expr]]): + def __init__(self, model: ModelProto, input_shapes: list[list[sympy.Expr]]): self._model: ModelProto = model self._graph: GraphProto = model.graph - self._input_shapes: List[List[sympy.Expr]] = input_shapes + self._input_shapes: list[list[sympy.Expr]] = input_shapes # For elementwise graph outputs, when we group nodes to different kernels, if the target shape is different # from other nodes' target shape, even it can be broadcasted, we still need to create a new kernel for it. - self._elementwise_graph_outputs: Set[str] = set() + self._elementwise_graph_outputs: set[str] = set() graph_output_names = [output.name for output in self._graph.output] for node in self._graph.node: if is_elementwise_node(node): @@ -46,12 +45,12 @@ def __init__(self, model: ModelProto, input_shapes: List[List[sympy.Expr]]): ) # Topological sort the nodes in the graph. 
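# --- Illustrative sketch (not part of this diff) ----------------------------
# SortedGraph above topologically sorts the graph nodes so that every node
# appears after the producers of its inputs. A generic Kahn's-algorithm sketch
# over a toy name -> dependencies map; it only illustrates the idea, not the
# actual topological_sort helper used in the diff:
from collections import deque


def toposort(deps: dict[str, set[str]]) -> list[str]:
    indegree = {name: len(d) for name, d in deps.items()}
    consumers: dict[str, set[str]] = {name: set() for name in deps}
    for name, d in deps.items():
        for dep in d:
            consumers[dep].add(name)
    ready = deque(sorted(name for name, deg in indegree.items() if deg == 0))
    order: list[str] = []
    while ready:
        name = ready.popleft()
        order.append(name)
        for consumer in sorted(consumers[name]):
            indegree[consumer] -= 1
            if indegree[consumer] == 0:
                ready.append(consumer)
    return order


# "MatMul" needs both of its inputs first, and "Softmax" needs the MatMul.
print(toposort({"input": set(), "weight": set(), "MatMul": {"input", "weight"}, "Softmax": {"MatMul"}}))
# --- end sketch --------------------------------------------------------------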
- self._sorted_nodes: List[NodeProto] = topological_sort( + self._sorted_nodes: list[NodeProto] = topological_sort( [input.name for input in self._graph.input] + [initializer.name for initializer in self._graph.initializer], self._graph.node, ) - self._node_arg_infos: Dict[str, TensorInfo] = {} + self._node_arg_infos: dict[str, TensorInfo] = {} for idx, input in enumerate(self._graph.input): self._node_arg_infos[input.name] = TensorInfo(input.type.tensor_type.elem_type, self._input_shapes[idx]) for initializer in self._graph.initializer: @@ -70,7 +69,7 @@ def __init__(self, model: ModelProto, input_shapes: List[List[sympy.Expr]]): initializers = {} for initializer in self._graph.initializer: initializers[initializer.name] = initializer - self._sorted_initializers: List[TensorProto] = [] + self._sorted_initializers: list[TensorProto] = [] for node in self._sorted_nodes: for input in node.input: if input in initializers: @@ -78,8 +77,8 @@ def __init__(self, model: ModelProto, input_shapes: List[List[sympy.Expr]]): initializers.pop(input) # Split nodes to constant nodes and non-constant nodes. - self._const_nodes: List[NodeProto] = [node for node in self._sorted_nodes if node.op_type == "Constant"] - self._sorted_nodes: List[NodeProto] = [node for node in self._sorted_nodes if node.op_type != "Constant"] + self._const_nodes: list[NodeProto] = [node for node in self._sorted_nodes if node.op_type == "Constant"] + self._sorted_nodes: list[NodeProto] = [node for node in self._sorted_nodes if node.op_type != "Constant"] def __str__(self): """ @@ -140,11 +139,11 @@ def __eq__(self, other): return str(self) == str(other) @property - def const_nodes(self) -> List[NodeProto]: + def const_nodes(self) -> list[NodeProto]: return self._const_nodes @property - def sorted_nodes(self) -> List[NodeProto]: + def sorted_nodes(self) -> list[NodeProto]: return self._sorted_nodes @property @@ -152,11 +151,11 @@ def original_graph(self) -> GraphProto: return self._graph @property - def node_arg_infos(self) -> Dict[str, TensorInfo]: + def node_arg_infos(self) -> dict[str, TensorInfo]: return self._node_arg_infos @property - def elementwise_graph_outputs(self) -> Set[str]: + def elementwise_graph_outputs(self) -> set[str]: return self._elementwise_graph_outputs def _decompose(self): diff --git a/orttraining/orttraining/python/training/ort_triton/_sympy_utils.py b/orttraining/orttraining/python/training/ort_triton/_sympy_utils.py index a4a384c021fe8..1df587fda054e 100644 --- a/orttraining/orttraining/python/training/ort_triton/_sympy_utils.py +++ b/orttraining/orttraining/python/training/ort_triton/_sympy_utils.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- import re -from typing import Any, List +from typing import Any import sympy @@ -15,12 +15,12 @@ def extract_shape_from_symbol(symbol: str) -> int: return int(match.group(3)) -def sympy_dot(seq1: List[sympy.Expr], seq2: List[sympy.Expr]) -> sympy.Expr: +def sympy_dot(seq1: list[sympy.Expr], seq2: list[sympy.Expr]) -> sympy.Expr: assert len(seq1) == len(seq2) - return sympy.expand(sum(a * b for a, b in zip(seq1, seq2))) + return sympy.expand(sum(a * b for a, b in zip(seq1, seq2, strict=False))) -def parse_shape(shape: List[Any]) -> List[sympy.Expr]: +def parse_shape(shape: list[Any]) -> list[sympy.Expr]: symbol_shapes = [] for dim in shape: symbol_dim = dim diff --git a/orttraining/orttraining/python/training/ort_triton/_utils.py b/orttraining/orttraining/python/training/ort_triton/_utils.py index 
e39a668bd0066..3cf5cfa184861 100644 --- a/orttraining/orttraining/python/training/ort_triton/_utils.py +++ b/orttraining/orttraining/python/training/ort_triton/_utils.py @@ -6,7 +6,7 @@ import re import uuid from collections import defaultdict -from typing import Any, List, Tuple +from typing import Any import numpy as np import torch @@ -27,7 +27,7 @@ def _topological_sort_internal(node, visited, output_consumers, sorted_nodes): # Topological sort of nodes given the input names. The list of nodes contain both constant and non-constant nodes. -def topological_sort(inputs: List[str], nodes: List[NodeProto]) -> List[NodeProto]: +def topological_sort(inputs: list[str], nodes: list[NodeProto]) -> list[NodeProto]: const_nodes = [] non_const_nodes = [] for node in nodes: @@ -119,7 +119,7 @@ def may_add_brackets(name: str) -> str: return name -def sort_reduce_axes(axes: List[int], rank: int, check_contiguous: bool = True) -> List[int]: +def sort_reduce_axes(axes: list[int], rank: int, check_contiguous: bool = True) -> list[int]: axes = [axis + rank if axis < 0 else axis for axis in axes] axes.sort() if check_contiguous: @@ -129,7 +129,7 @@ def sort_reduce_axes(axes: List[int], rank: int, check_contiguous: bool = True) # Get the keep_dims attribute and reduce axes from a reduce node. -def get_reduce_info(node: NodeProto, graph: GraphProto, input_rank: int) -> Tuple[int, List[int]]: +def get_reduce_info(node: NodeProto, graph: GraphProto, input_rank: int) -> tuple[int, list[int]]: keep_dims = get_attribute(node, "keepdims", 1) noop_with_empty_axes = get_attribute(node, "noop_with_empty_axes", 0) axes = get_attribute(node, "axes", None) diff --git a/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py b/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py index 3850d988ef473..67394fe297d51 100644 --- a/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py +++ b/orttraining/orttraining/python/training/ort_triton/kernel/_flash_attn.py @@ -40,7 +40,6 @@ """ import math -from typing import List, Tuple import torch import triton @@ -1009,7 +1008,7 @@ def _make_flash_attention_nodes( # Without causal mask, without Dropout. For example, BERT model in HuggingFace. -_PATTERN_0: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ +_PATTERN_0: list[tuple[str, bool, list[tuple[int, int, int]]]] = [ ("MatMul", False, []), # 0 ("Transpose", True, [(0, 0, 0)]), # 1 ("Transpose", True, [(0, 0, 1)]), # 2 @@ -1034,7 +1033,7 @@ def _make_flash_attention_nodes( ] -def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: List[NodeProto]): +def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: list[NodeProto]): # Check forward only as the backward is expected to be consistent if it's built correctly. scale_value = matcher.get_constant_value(nodes[3].input[1]) if not ( @@ -1063,7 +1062,7 @@ def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: List[NodePro # llama2+peft, k doesn't require grad. 
-_PATTERN_1: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ +_PATTERN_1: list[tuple[str, bool, list[tuple[int, int, int]]]] = [ ("MatMul", False, []), # 0 ("Transpose", True, [(0, 0, 1)]), # 1 ("Div", False, [(0, 0, 0)]), # 2 @@ -1087,7 +1086,7 @@ def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: List[NodePro ] -def _optimize_for_pattern_1(matcher: GraphProto, idx: int, nodes: List[NodeProto]): +def _optimize_for_pattern_1(matcher: GraphProto, idx: int, nodes: list[NodeProto]): # Check forward only as the backward is expected to be consistent if it's built correctly. scale_value = matcher.get_constant_value(nodes[2].input[1]) if not ( @@ -1138,7 +1137,7 @@ def _optimize_for_pattern_1(matcher: GraphProto, idx: int, nodes: List[NodeProto # llama2+peft, k requires grad. -_PATTERN_2: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ +_PATTERN_2: list[tuple[str, bool, list[tuple[int, int, int]]]] = [ ("MatMul", False, []), # 0 ("Transpose", True, [(0, 0, 1)]), # 1 ("Div", False, [(0, 0, 0)]), # 2 @@ -1164,7 +1163,7 @@ def _optimize_for_pattern_1(matcher: GraphProto, idx: int, nodes: List[NodeProto ] -def _aptimize_for_pattern_2(matcher: GraphProto, idx: int, nodes: List[NodeProto]): +def _aptimize_for_pattern_2(matcher: GraphProto, idx: int, nodes: list[NodeProto]): # Check forward only as the backward is expected to be consistent if it's built correctly. scale_value = matcher.get_constant_value(nodes[2].input[1]) if not ( diff --git a/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py b/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py index 1a944082fa4ba..dffdac0f34553 100644 --- a/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py +++ b/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py @@ -6,7 +6,6 @@ import math import os from types import ModuleType -from typing import Tuple import torch @@ -310,7 +309,7 @@ def _gen_mm_key(dtype: torch.dtype, m: int, n: int, k: int, trans_a: bool, trans def _gen_mm_module( dtype: torch.dtype, m: int, n: int, k: int, trans_a: bool, trans_b: bool, alpha: float -) -> Tuple[str, ModuleType]: +) -> tuple[str, ModuleType]: func_name = gen_unique_name("mm") kwargs = _mm_configs(dtype, m, n, k, trans_a, trans_b, alpha, func_name) src_code = _MM_TEMPLATE.format(**kwargs) @@ -347,7 +346,7 @@ def _gen_gemm_module( trans_b: bool, alpha: float, beta: float, -) -> Tuple[str, ModuleType]: +) -> tuple[str, ModuleType]: func_name = gen_unique_name("gemm") kwargs = _mm_configs(dtype, m, n, k, trans_a, trans_b, alpha, func_name) kwargs["stride_cm"] = stride_cm @@ -369,7 +368,7 @@ def _gen_bmm_key( def _gen_bmm_module( dtype: torch.dtype, m: int, n: int, k: int, batch_a: int, batch_b: int, trans_a: bool, trans_b: bool, alpha: float -) -> Tuple[str, ModuleType]: +) -> tuple[str, ModuleType]: func_name = gen_unique_name("bmm") kwargs = _mm_configs(dtype, m, n, k, trans_a, trans_b, alpha, func_name) batch = max(batch_a, batch_b) diff --git a/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py b/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py index 14bc2779aa05b..47d220826f73e 100644 --- a/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py +++ b/orttraining/orttraining/python/training/ort_triton/triton_op_executor.py @@ -9,7 +9,6 @@ import re import sys from types import ModuleType -from typing import List, Tuple, Union import onnx from onnx import ModelProto @@ -29,7 +28,7 @@ @functools.lru_cache(None) -def 
_gen_module_internal(sorted_graph: SortedGraph) -> Tuple[str, str, ModuleType]: +def _gen_module_internal(sorted_graph: SortedGraph) -> tuple[str, str, ModuleType]: func_name = gen_unique_name("func") src_code = codegen(func_name, sorted_graph) return func_name, src_code, PyCodeCache().load(src_code) @@ -58,7 +57,7 @@ def set_symbolic_shape_hint(cls, symbolic_shape_hint_config): cls.symbolic_shape_hint[k] = v @classmethod - def get_shape(cls, onnx_key: int, model: ModelProto, shapes: List[List[int]]) -> List[List[Union[int, str]]]: + def get_shape(cls, onnx_key: int, model: ModelProto, shapes: list[list[int]]) -> list[list[int | str]]: if onnx_key not in cls.cache: if cls.symbolic_shape_hint is not None: for i, input in enumerate(model.graph.input): @@ -90,12 +89,12 @@ def get_shape(cls, onnx_key: int, model: ModelProto, shapes: List[List[int]]) -> return cls.cache[onnx_key] -def _gen_key(onnx_key: int, model: ModelProto, shapes: List[List[Union[int, str]]]) -> int: +def _gen_key(onnx_key: int, model: ModelProto, shapes: list[list[int | str]]) -> int: # pylint: disable=unused-argument return hash(f"{onnx_key}|{str(shapes).replace(' ', '')}") -def _gen_module(onnx_key: int, model: ModelProto, shapes: List[List[Union[int, str]]]) -> Tuple[str, ModuleType]: +def _gen_module(onnx_key: int, model: ModelProto, shapes: list[list[int | str]]) -> tuple[str, ModuleType]: sorted_graph = SortedGraph(model, [parse_shape(shape) for shape in shapes]) if _DEBUG_MODE: os.makedirs(os.path.dirname("triton_debug/"), exist_ok=True) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py index 3e679c994f4bb..9ac65bde82bf8 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py @@ -223,7 +223,7 @@ def _default_export( assert len(args) == len(cconv), "Number of arguments does not match calling convention" # Encode inputs to torch.autograd.Function. - for i, arg, call_type in zip(range(len(args)), args, cconv): + for i, arg, call_type in zip(range(len(args)), args, cconv, strict=False): if call_type == "d": # Got a tensor variable. tensor_args.append(arg) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 004e3540c62d6..3762c8995cdb1 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- import os -from typing import Callable +from collections.abc import Callable import torch import torch.onnx.symbolic_helper as sym_help diff --git a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py index 047cd4c59d636..8d64caeec6051 100644 --- a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py +++ b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from typing import Tuple import onnxruntime from onnxruntime.capi import _pybind_state as C @@ -166,7 +165,7 @@ def run_backward(self, feeds, fetches, state): def get_serialized_ortmodule_memory_stat( self, memory_optimization_config_file_path: str, recompute_probe_level: str, return_opportunity_table: bool - ) -> Tuple[str, dict]: + ) -> tuple[str, dict]: """ Get serialized memory stats for OrtModule. """ diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback.py b/orttraining/orttraining/python/training/ortmodule/_fallback.py index 6a3793cf0f1fd..24eae3c369efe 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback.py @@ -6,7 +6,6 @@ import os from enum import IntFlag from logging import Logger -from typing import Optional import torch @@ -106,7 +105,7 @@ def __init__(self, pytorch_module: torch.nn.Module, policy: _FallbackPolicy, ret self._logger = logger def handle_exception( - self, exception: Exception, log_level: _logger.LogLevel, override_policy: Optional[_FallbackPolicy] = None + self, exception: Exception, log_level: _logger.LogLevel, override_policy: _FallbackPolicy | None = None ) -> None: """Process incoming `exception` based on the selected `policy` diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index c1ff62a5faea7..25dfd9c3d43dd 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -7,7 +7,6 @@ import logging import os from abc import ABC, abstractmethod # noqa: F401 -from typing import Dict, List, Optional, Tuple import onnx import torch @@ -30,7 +29,7 @@ class _RunStateInfo: - def __init__(self, state, output_info: List[Tuple[torch.Size, torch.device, torch.dtype]]): + def __init__(self, state, output_info: list[tuple[torch.Size, torch.device, torch.dtype]]): """ :param state: State of partial run that contains intermediate tensors needed to resume the run later. :param output_info: Output info. @@ -74,7 +73,7 @@ def __init__( self._flattened_module = module self._onnx_models = _onnx_models.ONNXModels() - self._graph_transition_manager: Optional[GraphTransitionManager] = None + self._graph_transition_manager: GraphTransitionManager | None = None # Model after inference optimization and then gradient building. self._graph_builder = None @@ -341,7 +340,7 @@ def _device(self): return self._graph_transition_manager._device @_logger.TrackTime(_logger.ORTModuleInitPhase.DETECTION) - def _detect_from_inputs(self, inputs: Tuple, kwargs: Dict): + def _detect_from_inputs(self, inputs: tuple, kwargs: dict): """ Based on runtime inspection, enable conditional optimizations if applicable. 
@@ -381,7 +380,7 @@ def _detect_from_inputs(self, inputs: Tuple, kwargs: Dict): [f"{k}:{v:.0f}%" for k, v in self._runtime_inspector._embedding_module_to_padding_density_map.items()] ) - def _append_pull_weight_trigger_as_input(self, kwargs: Dict, device: torch.device): + def _append_pull_weight_trigger_as_input(self, kwargs: dict, device: torch.device): if self._runtime_options.enable_zero_stage3_support: from ._zero_stage3_compatibility import ( STAGE3_PULL_WEIGHT_TRIGGER_NAME, diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py index 104cc0a894eed..237aafd6d2c3c 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import Logger -from typing import Union from ._fallback import _FallbackManager from ._inference_manager import InferenceManager @@ -24,7 +23,7 @@ def __init__( self._training_manager = TrainingManager(module, debug_options, fallback_manager, logger) self._inference_manager = InferenceManager(module, debug_options, fallback_manager, logger) - def __call__(self, is_training) -> Union[InferenceManager, TrainingManager]: + def __call__(self, is_training) -> InferenceManager | TrainingManager: if is_training: return self._training_manager else: diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py index bbf271e4e9b74..ba215bd86c5a3 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_transition_manager.py @@ -11,9 +11,9 @@ import logging import os from collections import OrderedDict +from collections.abc import Mapping, Sequence from functools import partial from hashlib import md5 as hash_fn -from typing import Mapping, Sequence import onnx import torch diff --git a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py index 61db462ad3bb8..362f1a88ce924 100644 --- a/orttraining/orttraining/python/training/ortmodule/_inference_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_inference_manager.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import Logger -from typing import Tuple import onnx import torch @@ -35,7 +34,7 @@ def execution_session_run_forward( onnx_model: onnx.ModelProto, device: torch.device, *inputs, - ) -> Tuple[Tuple[torch.Tensor, ...], _RunStateInfo]: + ) -> tuple[tuple[torch.Tensor, ...], _RunStateInfo]: """Runs the forward pass on `execution_session` with given `onnx_model`, `device` and `inputs` Args: diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py index 8ad3d0df3e4fa..f88390130b81f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_io.py +++ b/orttraining/orttraining/python/training/ortmodule/_io.py @@ -7,9 +7,9 @@ import gc import inspect from collections import OrderedDict, abc +from collections.abc import Callable, Mapping, Sequence from functools import partial from logging import Logger -from typing import 
Callable, Dict, List, Mapping, Optional, Sequence, Tuple import torch @@ -78,7 +78,7 @@ def symbolic(g, self): def deepcopy_model_input( *args, **kwargs -) -> Tuple[Sequence[ORTModelInputOutputType], Mapping[str, ORTModelInputOutputType]]: +) -> tuple[Sequence[ORTModelInputOutputType], Mapping[str, ORTModelInputOutputType]]: def extract_tensor(value): if isinstance(value, torch.Tensor): if value.requires_grad: @@ -101,7 +101,7 @@ def extract_tensor(value): def _extract_schema( data: ORTModelInputOutputType, device -) -> Tuple[Sequence[ORTModelInputOutputType], ORTModelInputOutputSchemaType]: +) -> tuple[Sequence[ORTModelInputOutputType], ORTModelInputOutputSchemaType]: try: flatten_data, schema = extract_data_and_schema(data, constant_as_tensor=True, device=device) return flatten_data, schema @@ -119,15 +119,15 @@ def __init__(self, original_module: torch.nn.Module): # original module's forward function. # So we need set those information that are needed to unflatten the args and kwargs, before calling the # torch.export. - self._device: Optional[torch.device] = None - self._args_schema: Optional[ORTModelInputOutputSchemaType] = None - self._kwargs_schema: Optional[ORTModelInputOutputSchemaType] = None - self._num_positionals: Optional[int] = None + self._device: torch.device | None = None + self._args_schema: ORTModelInputOutputSchemaType | None = None + self._kwargs_schema: ORTModelInputOutputSchemaType | None = None + self._num_positionals: int | None = None # Similarly, to make torch.export happy, we need to flatten the original module's outputs into a 1-D list of tensors. # Need to keep the output schema to unflatten the outputs back to the original structure. # Then those code depends on the original structure of the outputs can work properly. 
- self._output_schema: Optional[ORTModelInputOutputSchemaType] = None + self._output_schema: ORTModelInputOutputSchemaType | None = None def forward(self, *args): new_args = unflatten_data_using_schema(args[: self._num_positionals], self._args_schema) @@ -150,17 +150,17 @@ def forward(self, *args): class ModelInfoForExport: def __init__( self, - onnx_graph_input_names: List[str], - onnx_graph_input_names_require_grad: List[str], - onnx_graph_input_dynamic_axes_map: Dict[str, Dict[int, str]], - onnx_graph_input_shapes: List[List[int]], - onnx_graph_input_data_accessor_user_defined: Optional[Dict[str, callable]] = None, - onnx_graph_input_const_as_tensor: Optional[Dict[str, torch.device]] = None, - onnx_graph_input_arg_schema: Optional[Dict[str, ORTModelInputOutputSchemaType]] = None, - onnx_graph_input_kwarg_schema: Optional[Dict[str, ORTModelInputOutputSchemaType]] = None, + onnx_graph_input_names: list[str], + onnx_graph_input_names_require_grad: list[str], + onnx_graph_input_dynamic_axes_map: dict[str, dict[int, str]], + onnx_graph_input_shapes: list[list[int]], + onnx_graph_input_data_accessor_user_defined: dict[str, callable] | None = None, + onnx_graph_input_const_as_tensor: dict[str, torch.device] | None = None, + onnx_graph_input_arg_schema: dict[str, ORTModelInputOutputSchemaType] | None = None, + onnx_graph_input_kwarg_schema: dict[str, ORTModelInputOutputSchemaType] | None = None, num_positional_args: int = 0, - export_mode: Optional[int] = None, - export_extra_kwargs: Optional[Dict[str, any]] = None, + export_mode: int | None = None, + export_extra_kwargs: dict[str, any] | None = None, ): # Value can be either torch.onnx.TrainingMode.TRAINING or torch.onnx.TrainingMode.EVAL self.export_mode = export_mode @@ -172,41 +172,41 @@ def __init__( # Input names parsed and then flatten from the model's forward function signature. # This should contains ONLY the user defined input names # Be noted: some of the input might not be used by the model for its compute. - self.onnx_graph_input_names: List[str] = onnx_graph_input_names + self.onnx_graph_input_names: list[str] = onnx_graph_input_names # A subset of onnx_graph_input_names. # Input names that require gradient parsed and then flatten from the model's forward function signature # This should contains ONLY the user defined input names # Be noted: some of the input might not be used by the model for its compute. - self.onnx_graph_input_names_require_grad: List[str] = onnx_graph_input_names_require_grad + self.onnx_graph_input_names_require_grad: list[str] = onnx_graph_input_names_require_grad # Create symbolic names for each dimension of the graph input (e.g. onnx_graph_input_names). # The key is the input name, the value is a dict of {dim_index: symbolic_dim_name} # e.g. {"input1": {0: "input1_dim0", 1: "input1_dim1"}, "input2": {0: "input2_dim0"}} - self.onnx_graph_input_dynamic_axes_map: Dict[str, Dict[int, str]] = onnx_graph_input_dynamic_axes_map + self.onnx_graph_input_dynamic_axes_map: dict[str, dict[int, str]] = onnx_graph_input_dynamic_axes_map - self.onnx_graph_input_shapes: List[List[int]] = onnx_graph_input_shapes + self.onnx_graph_input_shapes: list[list[int]] = onnx_graph_input_shapes # The input args schema for the original model's forward function. # Only contains the schema for those inputs used by the model for its compute (e.g. as the inputs # of the export model). 
- self.onnx_graph_input_arg_schema: Dict[str, ORTModelInputOutputSchemaType] = onnx_graph_input_arg_schema + self.onnx_graph_input_arg_schema: dict[str, ORTModelInputOutputSchemaType] = onnx_graph_input_arg_schema # The input kwargs schema for the original model's forward function. # Only contains the schema for those inputs used by the model for its compute (e.g. as the inputs # of the export model). - self.onnx_graph_input_kwarg_schema: Dict[str, ORTModelInputOutputSchemaType] = onnx_graph_input_kwarg_schema + self.onnx_graph_input_kwarg_schema: dict[str, ORTModelInputOutputSchemaType] = onnx_graph_input_kwarg_schema self.num_positional_args: int = num_positional_args # A function to access the input data from the args and kwargs. # If it is not None, the length is same as onnx_graph_input_names. # For i-th input name, we can use the i-th function to get the input data from args and kwargs. - self.onnx_graph_input_data_accessor_user_defined: Optional[Dict[str, callable]] = ( + self.onnx_graph_input_data_accessor_user_defined: dict[str, callable] | None = ( onnx_graph_input_data_accessor_user_defined ) - self.onnx_graph_input_const_as_tensor: Optional[Dict[str, torch.device]] = onnx_graph_input_const_as_tensor + self.onnx_graph_input_const_as_tensor: dict[str, torch.device] | None = onnx_graph_input_const_as_tensor def __str__(self) -> str: return f"""ModelInfoForExport class: @@ -237,14 +237,14 @@ class SkipRetValue: def parse_inputs_for_onnx_export( - all_input_parameters: List[inspect.Parameter], + all_input_parameters: list[inspect.Parameter], args: Sequence[ORTModelInputOutputType], kwargs: Mapping[str, ORTModelInputOutputType], constant_as_tensor: bool, device: torch.device, export_mode: int, logger: Logger, - export_extra_kwargs: Optional[Dict[str, any]] = None, + export_extra_kwargs: dict[str, any] | None = None, ) -> ModelInfoForExport: """Parses through the model inputs and returns _InputInfo. 
@@ -275,7 +275,7 @@ def parse_inputs_for_onnx_export( arg_tensor_idx = [-1] kwarg_tensor_idx = [-1] - def _add_dynamic_shape(name, input) -> Dict[str, Dict[int, str]]: + def _add_dynamic_shape(name, input) -> dict[str, dict[int, str]]: dynamic_axes[name] = {} for dim_idx in range(len(input.shape)): dynamic_axes[name].update({dim_idx: f"{name}_dim{dim_idx}"}) @@ -285,7 +285,7 @@ def _warn_of_constant_inputs(data): logger.info(f"Received input of type {type(data)} is treated as a constant by ORT by default.") def _add_input( - name: str, input_value, onnx_graph_input_names: List[str], cur_func: Callable, tensor_idx: List[int] + name: str, input_value, onnx_graph_input_names: list[str], cur_func: Callable, tensor_idx: list[int] ): """Returns number of expanded non none inputs that _add_input processed""" @@ -396,16 +396,16 @@ def _access_func(key, cur_func, args, kwargs): raise ORTModuleIOError(f"ORTModule does not support input type {type(value)} for input {name}") - visited_input_names: List[str] = [] + visited_input_names: list[str] = [] - onnx_graph_input_names: List[str] = [] - dynamic_axes: Dict[str, Dict[int, str]] = {} - input_names_require_grad: List[str] = [] - input_shape: List[List[int]] = [] + onnx_graph_input_names: list[str] = [] + dynamic_axes: dict[str, dict[int, str]] = {} + input_names_require_grad: list[str] = [] + input_shape: list[list[int]] = [] input_arg_schema: ORTModelInputOutputSchemaType = [] input_kwarg_schema: ORTModelInputOutputSchemaType = OrderedDict() - data_accessors: Dict[str, Callable] = OrderedDict() - const_to_tensor_inputs: Dict[str, torch.device] = OrderedDict() + data_accessors: dict[str, Callable] = OrderedDict() + const_to_tensor_inputs: dict[str, torch.device] = OrderedDict() num_positional_args: int = 0 var_positional_idx = 0 @@ -511,7 +511,7 @@ def calculate_total_parameter_size_in_bytes(module: torch.nn.Module) -> int: return total_size -def can_module_be_deep_cloned(module: torch.nn.Module, device: Optional[torch.device]) -> bool: +def can_module_be_deep_cloned(module: torch.nn.Module, device: torch.device | None) -> bool: """Check if the module can be cloned If the 2 times total module parameter size >= device memory, the module cannot be cloned. 
@@ -568,8 +568,8 @@ def parse_outputs_for_onnx_export_and_extract_schema( sample_outputs = model_copy(*sample_args_copy, **sample_kwargs_copy) # Parse the output and extract the output_names and output_dynamic_axes to be used for onnx export - output_names: List[str] = [] - output_dynamic_axes: Dict[str, Dict[int, str]] = {} + output_names: list[str] = [] + output_dynamic_axes: dict[str, dict[int, str]] = {} for output_idx, output in enumerate(sample_outputs): output_name = f"output-{output_idx}" output_names.append(output_name) diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py index 4d54e8e59fb50..00acae9061495 100644 --- a/orttraining/orttraining/python/training/ortmodule/_logger.py +++ b/orttraining/orttraining/python/training/ortmodule/_logger.py @@ -9,10 +9,10 @@ import tempfile import textwrap import time +from collections.abc import Callable from contextlib import contextmanager from enum import IntEnum from functools import partial -from typing import Callable, Dict, List, Optional from onnxruntime.capi._pybind_state import Severity @@ -28,7 +28,7 @@ class LogLevel(IntEnum): FATAL = 5 -ORTMODULE_LOG_LEVEL_MAP: Dict[LogLevel, List[int]] = { +ORTMODULE_LOG_LEVEL_MAP: dict[LogLevel, list[int]] = { LogLevel.VERBOSE: [Severity.VERBOSE, logging.DEBUG], LogLevel.DEVINFO: [Severity.INFO, logging.INFO], # ONNX Runtime has too many INFO logs, so we map it to WARNING for a better user experience. @@ -107,8 +107,8 @@ class TimeTracker: def __init__( self, ): - self.starts_: List[float] = [TimeTracker.NOT_RECORD] * len(ORTModuleInitPhase) - self.ends_: List[float] = [TimeTracker.NOT_RECORD] * len(ORTModuleInitPhase) + self.starts_: list[float] = [TimeTracker.NOT_RECORD] * len(ORTModuleInitPhase) + self.ends_: list[float] = [TimeTracker.NOT_RECORD] * len(ORTModuleInitPhase) def start(self, phase: ORTModuleInitPhase): self.starts_[phase] = time.time() @@ -184,7 +184,7 @@ def wrapper(*args, **kwargs): @contextmanager -def _suppress_os_stream_output(enable=True, on_exit: Optional[Callable] = None): +def _suppress_os_stream_output(enable=True, on_exit: Callable | None = None): """Suppress output from being printed to stdout and stderr. If on_exit is not None, it will be called when the context manager exits. @@ -224,7 +224,7 @@ def _suppress_os_stream_output(enable=True, on_exit: Optional[Callable] = None): yield -def _log_with_filter(logger: logging.Logger, record_filters: Optional[List[str]], name: Optional[str], fo): +def _log_with_filter(logger: logging.Logger, record_filters: list[str] | None, name: str | None, fo): """Log the content by filtering with list of string patterns. Args: logger: The logger to log the content. diff --git a/orttraining/orttraining/python/training/ortmodule/_onnx_models.py b/orttraining/orttraining/python/training/ortmodule/_onnx_models.py index 4b6011f0786ec..3f9262bc010c2 100644 --- a/orttraining/orttraining/python/training/ortmodule/_onnx_models.py +++ b/orttraining/orttraining/python/training/ortmodule/_onnx_models.py @@ -4,7 +4,6 @@ import os from dataclasses import dataclass -from typing import Optional import onnx import torch @@ -31,7 +30,7 @@ class ONNXModels: It has further optimizations done by the InferenceSession and is saved by the InferenceSession. 
""" - optimized_model: Optional[onnx.ModelProto] = None + optimized_model: onnx.ModelProto | None = None def save_optimized_model(self, path, name_prefix, export_mode): # save the ortmodule optimized model diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index c739283e5cafb..6026ecb861efa 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -7,7 +7,6 @@ import tempfile from enum import IntEnum from logging import Logger -from typing import Dict, List, Optional, Tuple, Union import onnx import torch @@ -66,8 +65,8 @@ class MemoryOptimizationSummary: def __init__(self, saving_str="", simplified_saving_expr=None, evaluated_saving=None, freq=0): self.raw_symbolic_saving_str = saving_str - self.simplified_symbolic_saving_expr: Optional[Symbol] = simplified_saving_expr - self.evaluated_saving: Union[str, int, None] = evaluated_saving + self.simplified_symbolic_saving_expr: Symbol | None = simplified_saving_expr + self.evaluated_saving: str | int | None = evaluated_saving self.freq = freq @@ -93,9 +92,9 @@ def __init__(self, m: torch.nn.Module, logger: Logger, training: bool): self._is_enabled = True # Memory optimization related. - self.cluster_id_combination_to_saving_symbolics_map: Dict[str, MemoryOptimizationSummary] = {} + self.cluster_id_combination_to_saving_symbolics_map: dict[str, MemoryOptimizationSummary] = {} ## The value is a list of symbolic dim values parsed from the first batch. - self.symbolic_dim_name_to_value_map: Dict = {} + self.symbolic_dim_name_to_value_map: dict = {} ## Used to control only the first batch is used to collect symbolic dim values. 
self.symbolic_dim_collecting_completed = False @@ -132,8 +131,8 @@ def enable_memory_stats_by_step(self, print_memory_stats_by_step: bool): def collect_symbolic_dim_values( self, - onnx_input_name_to_dynamic_axes_map: Dict[str, Dict[int, str]], - onnx_input_to_value_map: Dict[str, torch.Tensor], + onnx_input_name_to_dynamic_axes_map: dict[str, dict[int, str]], + onnx_input_to_value_map: dict[str, torch.Tensor], ): """Collect symbolic dim values.""" for input_name, dynamic_axes in onnx_input_name_to_dynamic_axes_map.items(): @@ -169,7 +168,7 @@ def find_memory_optimization_opportunity(self, execution_agent: TrainingAgent, r memory_optimizer_config_file_path, recompute_probe_config, False ) - cluster_id_to_saving_symbol_map: Dict[str, MemoryOptimizationSummary] = {} + cluster_id_to_saving_symbol_map: dict[str, MemoryOptimizationSummary] = {} for cluster_id, memory_saving_stat in memory_optimization_saving_symbolics.items(): memory_saving_symbolic = memory_saving_stat[0] freq = memory_saving_stat[1] @@ -282,7 +281,7 @@ def _increase_step(self): def display_memory_optimization_plans( self, memory_optimizer_config_file_path, details=False - ) -> Tuple[List[str], PTable]: + ) -> tuple[list[str], PTable]: mem_plan_count = len(self.cluster_id_combination_to_saving_symbolics_map) if mem_plan_count > 0: @@ -386,9 +385,9 @@ def backward(ctx, grad_output: torch.Tensor): @staticmethod def infer_shape( node: onnx.NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: return tensor_input_shapes, tensor_input_dtypes @staticmethod diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py index 897bf89c15063..2ae3c98137cbd 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py @@ -3,7 +3,8 @@ # _torch_module_interface.py from collections import OrderedDict -from typing import Callable, Iterator, Optional, Tuple, TypeVar +from collections.abc import Callable, Iterator +from typing import Optional, TypeVar import torch @@ -58,10 +59,10 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): def load_state_dict(self, state_dict: "OrderedDict[str, torch.Tensor]", strict: bool = True): raise NotImplementedError(f"load_state_dict is not implemented for {type(self)}.") - def register_buffer(self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True) -> None: + def register_buffer(self, name: str, tensor: torch.Tensor | None, persistent: bool = True) -> None: raise NotImplementedError(f"register_buffer is not implemented for {type(self)}.") - def register_parameter(self, name: str, param: Optional[torch.nn.Parameter]) -> None: + def register_parameter(self, name: str, param: torch.nn.Parameter | None) -> None: raise NotImplementedError(f"register_parameter is not implemented for {type(self)}.") def get_parameter(self, target: str) -> torch.nn.Parameter: @@ -73,13 +74,13 @@ def get_buffer(self, target: str) -> torch.Tensor: def parameters(self, recurse: bool = True) -> Iterator[torch.nn.Parameter]: raise 
NotImplementedError(f"parameters is not implemented for {type(self)}.") - def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.nn.Parameter]]: + def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.nn.Parameter]]: raise NotImplementedError(f"named_parameters is not implemented for {type(self)}.") def buffers(self, recurse: bool = True) -> Iterator[torch.Tensor]: raise NotImplementedError(f"buffers is not implemented for {type(self)}.") - def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.Tensor]]: + def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.Tensor]]: raise NotImplementedError(f"named_buffers is not implemented for {type(self)}.") def _load_from_state_dict( @@ -87,7 +88,7 @@ def _load_from_state_dict( ): raise NotImplementedError(f"_load_from_state_dict is not implemented for {type(self)}.") - def named_children(self) -> Iterator[Tuple[str, T]]: + def named_children(self) -> Iterator[tuple[str, T]]: raise NotImplementedError(f"named_children is not implemented for {type(self)}.") def modules(self) -> Iterator[T]: diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py index 125590902294d..2ed346fe0bfa6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py @@ -3,8 +3,9 @@ # _torch_module_ort.py from collections import OrderedDict +from collections.abc import Callable, Iterator from logging import Logger -from typing import Callable, Iterator, Optional, Tuple, TypeVar +from typing import Optional, TypeVar import torch @@ -75,12 +76,12 @@ def load_state_dict(self, state_dict: "OrderedDict[str, torch.Tensor]", strict: # key names does not need to contain the _module.flattened_module._original_module prefix return self._original_module.load_state_dict(state_dict, strict=strict) - def register_buffer(self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True) -> None: + def register_buffer(self, name: str, tensor: torch.Tensor | None, persistent: bool = True) -> None: """Override original method to delegate execution to the original PyTorch user module""" self._original_module.register_buffer(name, tensor, persistent=persistent) - def register_parameter(self, name: str, param: Optional[torch.nn.Parameter]) -> None: + def register_parameter(self, name: str, param: torch.nn.Parameter | None) -> None: """Override original method to delegate execution to the original PyTorch user module""" self._original_module.register_parameter(name, param) @@ -100,7 +101,7 @@ def parameters(self, recurse: bool = True) -> Iterator[torch.nn.Parameter]: yield from self._original_module.parameters(recurse=recurse) - def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.nn.Parameter]]: + def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.nn.Parameter]]: """Override original method to delegate execution to the original PyTorch user module""" yield from self._original_module.named_parameters(prefix=prefix, recurse=recurse) @@ -110,7 +111,7 @@ def buffers(self, recurse: bool = True) -> Iterator[torch.Tensor]: yield from self._original_module.buffers(recurse=recurse) - def named_buffers(self, prefix: str = "", recurse: bool = True) -> 
Iterator[Tuple[str, torch.Tensor]]: + def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.Tensor]]: """Override original method to delegate execution to the original PyTorch user module""" yield from self._original_module.named_buffers(prefix=prefix, recurse=recurse) @@ -129,7 +130,7 @@ def _load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) - def named_children(self) -> Iterator[Tuple[str, T]]: + def named_children(self) -> Iterator[tuple[str, T]]: """Override original method to delegate execution to the original PyTorch user module""" yield from self._original_module.named_children() diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py index 9f7fb1d0dcd16..2c38e98cc8657 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py @@ -3,7 +3,8 @@ # _torch_module_pytorch.py from collections import OrderedDict -from typing import Callable, Iterator, Optional, Tuple, TypeVar +from collections.abc import Callable, Iterator +from typing import Optional, TypeVar import torch @@ -38,10 +39,10 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): def load_state_dict(self, state_dict: "OrderedDict[str, torch.Tensor]", strict: bool = True): return self._original_module.load_state_dict(state_dict, strict=strict) - def register_buffer(self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True) -> None: + def register_buffer(self, name: str, tensor: torch.Tensor | None, persistent: bool = True) -> None: self._original_module.register_buffer(name, tensor, persistent=persistent) - def register_parameter(self, name: str, param: Optional[torch.nn.Parameter]) -> None: + def register_parameter(self, name: str, param: torch.nn.Parameter | None) -> None: self._original_module.register_parameter(name, param) def get_parameter(self, target: str) -> torch.nn.Parameter: @@ -53,13 +54,13 @@ def get_buffer(self, target: str) -> torch.Tensor: def parameters(self, recurse: bool = True) -> Iterator[torch.nn.Parameter]: yield from self._original_module.parameters(recurse=recurse) - def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.nn.Parameter]]: + def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.nn.Parameter]]: yield from self._original_module.named_parameters(prefix=prefix, recurse=recurse) def buffers(self, recurse: bool = True) -> Iterator[torch.Tensor]: yield from self._original_module.buffers(recurse=recurse) - def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.Tensor]]: + def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.Tensor]]: yield from self._original_module.named_buffers(prefix=prefix, recurse=recurse) def _load_from_state_dict( @@ -69,7 +70,7 @@ def _load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) - def named_children(self) -> Iterator[Tuple[str, T]]: + def named_children(self) -> Iterator[tuple[str, T]]: yield from self._original_module.named_children() def modules(self) -> Iterator[T]: diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py 
b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index d5d5ce672224c..b4303587e69e6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -4,7 +4,6 @@ # -------------------------------------------------------------------------- from logging import Logger -from typing import Tuple import onnx import torch @@ -48,7 +47,7 @@ def execution_session_run_forward( device: torch.device, gradient_accumulation_manager: GradientAccumulationManager, *inputs, - ) -> Tuple[Tuple[torch.Tensor, ...], _RunStateInfo]: + ) -> tuple[tuple[torch.Tensor, ...], _RunStateInfo]: """Runs the forward pass on `execution_session` with given `onnx_model`, `device` and `inputs` Args: @@ -85,7 +84,7 @@ def execution_session_run_forward( # Run and return module outputs. execution_session.run_forward(forward_inputs, forward_outputs, state, gradient_accumulation_manager.cache) - user_outputs: Tuple[torch.Tensor, ...] = gradient_accumulation_manager.extract_outputs_and_maybe_update_cache( + user_outputs: tuple[torch.Tensor, ...] = gradient_accumulation_manager.extract_outputs_and_maybe_update_cache( forward_outputs, device ) diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index 4787cb31a24fd..2e115654e4c96 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -13,7 +13,7 @@ import random import traceback import types -from typing import Callable, List, Optional, Tuple, Union +from collections.abc import Callable import numpy as np import torch @@ -63,8 +63,8 @@ def _ortvalue_from_torch_tensor(torch_tensor: torch.Tensor) -> C.OrtValue: def _ortvalues_to_torch_tensor( - ortvalues: C.OrtValueVector, device: Optional[torch.device] = None -) -> Tuple[torch.Tensor, ...]: + ortvalues: C.OrtValueVector, device: torch.device | None = None +) -> tuple[torch.Tensor, ...]: if len(ortvalues) == 0: return tuple() @@ -76,7 +76,7 @@ def _ortvalues_to_torch_tensor( if not isinstance(ortvalues, C.OrtValueVector): raise TypeError(f"ortvalues must be an instance of OrtValueVector not {type(ortvalues)!r}.") - res: List[torch.Tensor] = ortvalues.to_dlpacks(_from_dlpack) + res: list[torch.Tensor] = ortvalues.to_dlpacks(_from_dlpack) bool_indices = ortvalues.bool_tensor_indices() if len(bool_indices): # DLPack structure does not know for sure if it stores boolean @@ -127,7 +127,7 @@ def _check_same_device(device: torch.device, argument_str: str, *args): ) -def get_device_index(device: Union[str, int, torch.device]) -> int: +def get_device_index(device: str | int | torch.device) -> int: if isinstance(device, str): # could be 'cuda:0', 'cuda:1', or 'cpu'. with cpu, set index=0 device = torch.device(device) @@ -136,7 +136,7 @@ def get_device_index(device: Union[str, int, torch.device]) -> int: return 0 if device.index is None else device.index -def get_device_str(device: Union[str, int, torch.device]) -> str: +def get_device_str(device: str | int | torch.device) -> str: if isinstance(device, str): # could be 'cuda:0', 'cuda:1', or 'cpu'. 
with cpu, set index=0 if device.find(":") == -1: @@ -161,7 +161,7 @@ def get_device_from_module_and_inputs(module, inputs, kwargs): return device -def _get_device_from_module(module) -> Optional[torch.device]: +def _get_device_from_module(module) -> torch.device | None: """Returns the first device found in the `module`'s parameters or None Args: @@ -187,7 +187,7 @@ def _get_device_from_module(module) -> Optional[torch.device]: return device -def _get_device_from_inputs(args, kwargs) -> Optional[torch.device]: +def _get_device_from_inputs(args, kwargs) -> torch.device | None: """Returns device from first PyTorch Tensor within args or kwargs Args: diff --git a/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py b/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py index 7da3e18007447..75601d0c828b8 100644 --- a/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py +++ b/orttraining/orttraining/python/training/ortmodule/_zero_stage3_compatibility.py @@ -5,7 +5,6 @@ from contextlib import contextmanager -from typing import Dict, List, Optional, Tuple, Union import torch from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, helper @@ -31,8 +30,8 @@ def post_processing_enable_zero_stage3_compat( exported_model: ModelProto, - zero_stage3_named_params: Dict[str, torch.nn.parameter.Parameter], - all_param_names: List[str], + zero_stage3_named_params: dict[str, torch.nn.parameter.Parameter], + all_param_names: list[str], ) -> ModelProto: """This function is used to enable zero stage3 compatibility. @@ -62,7 +61,7 @@ def post_processing_enable_zero_stage3_compat( def _get_param_pull_trigger_name(param_name: str) -> str: return f"pull_{param_name}" - def _get_func_name(node: NodeProto) -> Optional[str]: + def _get_func_name(node: NodeProto) -> str | None: for attr in node.attribute: if attr.name == "func_name": return attr.s.decode("utf-8") if isinstance(attr.s, bytes) else attr.s @@ -210,7 +209,7 @@ def _get_func_name(node: NodeProto) -> Optional[str]: def _create_weight_retrieval_function( - zero_stage3_named_params: Optional[Dict[str, torch.nn.parameter.Parameter]], + zero_stage3_named_params: dict[str, torch.nn.parameter.Parameter] | None, ) -> str: """This function is used to create a weight retrieving function using zero_stage3_named_params.""" @@ -231,9 +230,9 @@ def backward(ctx, *grad_outputs): @staticmethod def infer_shape( node: NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: param_count = len(zero_stage3_named_params.values()) tensor_output_shapes = [ tensor_input_shapes[0], @@ -258,9 +257,9 @@ def _register_symbolic_shape_infer_functions(): def _simple_pass_through_infer_shape( node: NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: return tensor_input_shapes, 
tensor_input_dtypes register_shape_inference_function(DEEPSPEED_PRE_BACKWARD_FUNCTION_NAME, _simple_pass_through_infer_shape) @@ -268,9 +267,9 @@ def _simple_pass_through_infer_shape( def _linear_infer_shape( node: NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: # output = input.matmul(weight.t()) tensor_input_shapes[0] # input shape2 = tensor_input_shapes[1] # weight @@ -311,13 +310,13 @@ def _alias_input(node_proto_str: str): def _create_weight_retrieval_pythonop( - zero_stage3_named_params: Optional[Dict[str, torch.nn.parameter.Parameter]], + zero_stage3_named_params: dict[str, torch.nn.parameter.Parameter] | None, func_full_qual_name: str, input_name: str, - output_names: List[str], + output_names: list[str], pull_weight_trigger_output_dtype: int, - pull_weight_trigger_output_shape: List[int], -) -> Tuple[ValueInfoProto, NodeProto]: + pull_weight_trigger_output_shape: list[int], +) -> tuple[ValueInfoProto, NodeProto]: """This function is used to create a weight retrieving PythonOp.""" offload_param_count = 0 if zero_stage3_named_params is None else len(zero_stage3_named_params) new_input = helper.make_tensor_value_info( @@ -417,7 +416,7 @@ def stage3_export_context(enable: bool, stage3_param_handle, flattened_module): from torch.onnx._internal import _beartype @_beartype.beartype - def _get_tensor_rank(x) -> Optional[int]: + def _get_tensor_rank(x) -> int | None: ### Adapted from https://github.com/pytorch/pytorch/blob/185515368bcd7d94ac06ab1634f22b747b03c6d9/torch/onnx/symbolic_helper.py#L561 # Retrieve the real rank for the stage3 weights, because stage3 weights are all (0). from typing import cast as typing_cast diff --git a/orttraining/orttraining/python/training/ortmodule/graph_optimizer_registry.py b/orttraining/orttraining/python/training/ortmodule/graph_optimizer_registry.py index 897ecac148bfb..fa4c6dd04d81b 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_optimizer_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_optimizer_registry.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from typing import Callable +from collections.abc import Callable from onnx.onnx_ml_pb2 import GraphProto diff --git a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py index c1fb6e68568f5..b5e5ae45f3631 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/_aten_attn.py @@ -15,8 +15,6 @@ support if we want to try in the future. """ -from typing import List, Tuple - from onnx import GraphProto, NodeProto, TensorProto, helper from ..graph_optimizer_registry import register_graph_optimizer @@ -125,7 +123,7 @@ def _make_efficient_attention_nodes( # Without causal mask, with Dropout. For example, BERT model in HuggingFace. 
-_PATTERN_0: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ +_PATTERN_0: list[tuple[str, bool, list[tuple[int, int, int]]]] = [ ("MatMul", False, []), # 0 ("Transpose", True, [(0, 0, 0)]), # 1 ("Transpose", True, [(0, 0, 1)]), # 2 @@ -152,7 +150,7 @@ def _make_efficient_attention_nodes( ] -def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: List[NodeProto]): +def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: list[NodeProto]): # Check forward only as the backward is expected to be consistent if it's built correctly. scale_value = matcher.get_constant_value(nodes[3].input[1]) ratio_value = matcher.get_constant_value(nodes[6].input[1]) @@ -188,7 +186,7 @@ def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: List[NodePro # Without causal mask, without Dropout. For example, BERT model and disabling attention dropout in HuggingFace. -_PATTERN_1: List[Tuple[str, bool, List[Tuple[int, int, int]]]] = [ +_PATTERN_1: list[tuple[str, bool, list[tuple[int, int, int]]]] = [ ("MatMul", False, []), # 0 ("Transpose", True, [(0, 0, 0)]), # 1 ("Transpose", True, [(0, 0, 1)]), # 2 @@ -213,7 +211,7 @@ def _optimize_for_pattern_0(matcher: GraphMatcher, idx: int, nodes: List[NodePro ] -def _optimize_for_pattern_1(matcher: GraphMatcher, idx: int, nodes: List[NodeProto]): +def _optimize_for_pattern_1(matcher: GraphMatcher, idx: int, nodes: list[NodeProto]): # Check forward only as the backward is expected to be consistent if it's built correctly. scale_value = matcher.get_constant_value(nodes[3].input[1]) if not ( diff --git a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py index fbd98675aebe6..9089004559923 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_optimizers/utils.py @@ -4,7 +4,8 @@ # -------------------------------------------------------------------------- import itertools -from typing import Any, Dict, List, Sequence, Tuple +from collections.abc import Sequence +from typing import Any import numpy as np from onnx import GraphProto, NodeProto, TensorProto, helper, numpy_helper @@ -54,8 +55,8 @@ class GraphMatcher: def __init__(self, graph: GraphProto): self._graph: GraphProto = graph - self._op_type_to_nodes: Dict[str, List[NodeProto]] = {} - self._consumer_count: Dict[str, int] = {} + self._op_type_to_nodes: dict[str, list[NodeProto]] = {} + self._consumer_count: dict[str, int] = {} for node in graph.node: if node.op_type not in self._op_type_to_nodes: self._op_type_to_nodes[node.op_type] = [] @@ -117,7 +118,7 @@ def get_type_and_shape(self, arg: str): return initializers[0].data_type, initializers[0].dims return None, None - def _match_pattern(self, node: NodeProto, pattern: List[Tuple[str, bool, List[Tuple[int, int, int]]]]): + def _match_pattern(self, node: NodeProto, pattern: list[tuple[str, bool, list[tuple[int, int, int]]]]): nodes = [node] for i in range(1, len(pattern)): next_op_type = pattern[i][0] @@ -140,7 +141,7 @@ def _match_pattern(self, node: NodeProto, pattern: List[Tuple[str, bool, List[Tu nodes.append(next_node) return nodes - def match_pattern(self, pattern: List[Tuple[str, bool, List[Tuple[int, int, int]]]]): + def match_pattern(self, pattern: list[tuple[str, bool, list[tuple[int, int, int]]]]): for node in self._op_type_to_nodes.get(pattern[0][0], []): result = self._match_pattern(node, pattern) if len(result) == 
len(pattern): @@ -165,9 +166,9 @@ def make_constant_node(name: str, dtype: TensorProto.DataType, dims: Sequence[in def update_graph( graph: GraphProto, - nodes_to_remove: List[NodeProto], - nodes_to_add: List[NodeProto], - new_value_infos: List[TensorProto] = [], # noqa: B006 + nodes_to_remove: list[NodeProto], + nodes_to_add: list[NodeProto], + new_value_infos: list[TensorProto] = [], # noqa: B006 ): """Update an ONNX graph by removing some nodes, and adding some new nodes and value infos.""" nodes = [node for node in graph.node if node not in nodes_to_remove] diff --git a/orttraining/orttraining/python/training/ortmodule/ortmodule.py b/orttraining/orttraining/python/training/ortmodule/ortmodule.py index b291bfb2ba03c..a7942eea5be26 100644 --- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py @@ -18,7 +18,9 @@ from onnxruntime.tools import pytorch_export_contrib_ops import torch -from typing import Iterator, Optional, OrderedDict, Tuple, TypeVar, Callable +from typing import TypeVar +from collections import OrderedDict +from collections.abc import Iterator, Callable # Needed to override PyTorch methods T = TypeVar("T", bound="torch.nn.Module") @@ -35,7 +37,7 @@ class ORTModule(torch.nn.Module): debug_options (:obj:`DebugOptions`, optional): debugging options for ORTModule. """ - def __init__(self, module: torch.nn.Module, debug_options: Optional[DebugOptions] = None): + def __init__(self, module: torch.nn.Module, debug_options: DebugOptions | None = None): # NOTE: torch.nn.Modules that call setattr on their internal attributes regularly # (for example PyTorch Lightning), will trigger regular re-exports. This is # because ORTModule auto detects such setattrs on the original module and @@ -154,7 +156,7 @@ def _replicate_for_data_parallel(self): return self._torch_module._replicate_for_data_parallel() - def add_module(self, name: str, module: Optional[torch.nn.Module]) -> None: + def add_module(self, name: str, module: torch.nn.Module | None) -> None: """Raises a ORTModuleTorchModelException exception since ORTModule does not support adding modules to it""" self._torch_module.add_module(name, module) @@ -217,12 +219,12 @@ def load_state_dict(self, state_dict: "OrderedDict[str, torch.Tensor]", strict: return self._torch_module.load_state_dict(state_dict, strict=strict) - def register_buffer(self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True) -> None: + def register_buffer(self, name: str, tensor: torch.Tensor | None, persistent: bool = True) -> None: """Override :meth:`~torch.nn.Module.register_buffer`""" self._torch_module.register_buffer(name, tensor, persistent=persistent) - def register_parameter(self, name: str, param: Optional[torch.nn.Parameter]) -> None: + def register_parameter(self, name: str, param: torch.nn.Parameter | None) -> None: """Override :meth:`~torch.nn.Module.register_parameter`""" self._torch_module.register_parameter(name, param) @@ -242,7 +244,7 @@ def parameters(self, recurse: bool = True) -> Iterator[torch.nn.Parameter]: yield from self._torch_module.parameters(recurse=recurse) - def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.nn.Parameter]]: + def named_parameters(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.nn.Parameter]]: """Override :meth:`~torch.nn.Module.named_parameters`""" yield from self._torch_module.named_parameters(prefix=prefix, recurse=recurse) @@ -252,7 +254,7 @@ def 
buffers(self, recurse: bool = True) -> Iterator[torch.Tensor]: yield from self._torch_module.buffers(recurse=recurse) - def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[Tuple[str, torch.Tensor]]: + def named_buffers(self, prefix: str = "", recurse: bool = True) -> Iterator[tuple[str, torch.Tensor]]: """Override :meth:`~torch.nn.Module.named_buffers`""" yield from self._torch_module.named_buffers(prefix=prefix, recurse=recurse) @@ -266,7 +268,7 @@ def _load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) - def named_children(self) -> Iterator[Tuple[str, torch.nn.Module]]: + def named_children(self) -> Iterator[tuple[str, torch.nn.Module]]: """Override :meth:`~torch.nn.Module.named_children`""" yield from self._torch_module.named_children() diff --git a/orttraining/orttraining/python/training/utils/data/sampler.py b/orttraining/orttraining/python/training/utils/data/sampler.py index afc4d360b1582..8dfe576714609 100644 --- a/orttraining/orttraining/python/training/utils/data/sampler.py +++ b/orttraining/orttraining/python/training/utils/data/sampler.py @@ -3,7 +3,7 @@ # sampler.py import math -from typing import Callable, Iterator, Optional +from collections.abc import Callable, Iterator import numpy as np import torch @@ -106,10 +106,10 @@ def __init__( self, dataset: Dataset, complexity_fn: Callable[..., int], - world_size: Optional[int] = None, - rank: Optional[int] = None, + world_size: int | None = None, + rank: int | None = None, shuffle: bool = True, - group_size: Optional[int] = None, + group_size: int | None = None, seed: int = 0, drop_last: bool = False, random_level: float = 0, diff --git a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py index d7ea3dc419114..d466faddf91bc 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py +++ b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py @@ -8,7 +8,6 @@ import warnings from io import TextIOWrapper from pathlib import Path -from typing import List, Optional, Tuple, Union import onnx import torch @@ -29,7 +28,7 @@ class _InspectActivation(torch.autograd.Function): def forward( ctx, activation_name: str, - module_idx: Optional[int], + module_idx: int | None, run_ctx: RuntimeStates, input_tensor: torch.Tensor, module_post_forward, @@ -89,9 +88,9 @@ def backward(ctx, grad_output: torch.Tensor): @staticmethod def infer_shape( node: onnx.NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: return tensor_input_shapes, tensor_input_dtypes @staticmethod @@ -124,8 +123,8 @@ class StatisticsSubscriber(SubscriberBase): def __init__( self, output_dir: str, - start_step: Union[None, int] = None, - end_step: Union[None, int] = None, + start_step: None | int = None, + end_step: None | int = None, override_output_dir: bool = False, run_on_cpu: bool = False, bucket_size: int = 1024 * 1024 * 1024 // 2, diff --git a/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py 
b/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py index 1b9a6fc91ec3c..05c58b86b993f 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py +++ b/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py @@ -5,7 +5,6 @@ import sys -from typing import Optional, Tuple import torch @@ -52,7 +51,7 @@ class SubscriberBase: With this, the overall flow can be traced as a data flow graph (DAG). """ - def __init__(self, start_step: Optional[int], end_step: Optional[int]): + def __init__(self, start_step: int | None, end_step: int | None): """ Steps in [start_step, end_step) will run the subscriber's actions, and other steps will skip. If start_step is None, 0 is given; if end_step is None, sys.maxsize is given. @@ -66,7 +65,7 @@ def pre_forward_module_apply( module: torch.nn.Module, args: ORTModelInputOutputType, kwargs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: """This function is called inside the nn.Module's pre-forward hook. Args: @@ -91,7 +90,7 @@ def pre_forward_module_apply_impl( module: torch.nn.Module, args: ORTModelInputOutputType, kwargs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: return args, kwargs def pre_forward_tensor_apply( @@ -121,7 +120,7 @@ def post_forward_module_apply( module: torch.nn.Module, args: ORTModelInputOutputType, outputs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: """This function is called inside the nn.Module's post-forward hook. Args: @@ -146,7 +145,7 @@ def post_forward_module_apply_impl( module: torch.nn.Module, args: ORTModelInputOutputType, outputs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: return args, outputs def post_forward_tensor_apply( @@ -179,7 +178,7 @@ def post_forward_outmost_module_apply( module: torch.nn.Module, args: ORTModelInputOutputType, outputs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: """This function is called inside the outmost nn.Module's post-forward hook. 
Args: @@ -204,7 +203,7 @@ def post_forward_outmost_module_apply_impl( module: torch.nn.Module, args: ORTModelInputOutputType, outputs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: return args, outputs def _need_skip_step(self, current_step: int) -> bool: diff --git a/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py b/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py index c9c06dabab4de..c41f5078b20d7 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py +++ b/orttraining/orttraining/python/training/utils/hooks/_subscriber_manager.py @@ -6,7 +6,6 @@ import inspect from contextlib import contextmanager -from typing import List, Optional, Set, Tuple, Union import onnx import torch @@ -40,7 +39,7 @@ class _IncrementStep(torch.autograd.Function): """ @staticmethod - def forward(ctx, run_ctx: RuntimeStates, *input_tensor_list: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]: + def forward(ctx, run_ctx: RuntimeStates, *input_tensor_list: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: """Make sure there is the same number of `tensor` inputs and outputs. This is enforced by ORT's PythonOp's schema check. """ @@ -57,15 +56,15 @@ def forward(ctx, run_ctx: RuntimeStates, *input_tensor_list: Tuple[torch.Tensor, return tuple(t.detach().requires_grad_(t.requires_grad) for t in input_tensor_list) @staticmethod - def backward(ctx, *grad_output: Tuple[Optional[torch.Tensor], ...]) -> Tuple[Optional[torch.Tensor], ...]: + def backward(ctx, *grad_output: tuple[torch.Tensor | None, ...]) -> tuple[torch.Tensor | None, ...]: return (None, *tuple(g for g in grad_output)) @staticmethod def infer_shape( node: onnx.NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: return tensor_input_shapes, tensor_input_dtypes @staticmethod @@ -104,11 +103,11 @@ class SubscriberManager: def __init__(self): self._run_ctx = RuntimeStates() - self._subscribers: Set[SubscriberBase] = set() + self._subscribers: set[SubscriberBase] = set() self._pre_forward_hooks = [] self._post_forward_hooks = [] - def subscribe(self, module: torch.nn.Module, subscribers: List[SubscriberBase]): + def subscribe(self, module: torch.nn.Module, subscribers: list[SubscriberBase]): """ The API is called externally to register hooks that are implicitly defined by subscribers. Each time all global states will be cleaned up once called. @@ -192,7 +191,7 @@ def _post_forward_outmost_module_hook(module, module_inputs, module_outputs): module.register_forward_hook(_post_forward_outmost_module_hook) def _initialize_one_time_global_states(self, module: torch.nn.Module): - def _reset_recursively(module: torch.nn.Module, depth: int, next_module_index: List[int]): + def _reset_recursively(module: torch.nn.Module, depth: int, next_module_index: list[int]): """ Called to register hooks for every `torch.nn.Module`. 
Due to `Module` can contain child `Module`s, this function is called recursively by passing in `next_module_index` - a list of int to maintain a @@ -219,7 +218,7 @@ def _reset_recursively(module: torch.nn.Module, depth: int, next_module_index: L next_module_index = [0] _reset_recursively(module, 1, next_module_index) - def _register_hooks_recursively(self, module: torch.nn.Module, depth: int, next_module_index: List[int]): + def _register_hooks_recursively(self, module: torch.nn.Module, depth: int, next_module_index: list[int]): """Register hooks for every `torch.nn.Module`. Due to `Module` can contain child `Module`s, this function is called recursively by passing in `next_module_index` - a list of int to maintain a global incremental unique module id. diff --git a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py index d4b9768116e92..57078222a22e7 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py +++ b/orttraining/orttraining/python/training/utils/hooks/_zero_offload_subscriber.py @@ -7,9 +7,10 @@ import inspect import warnings from collections import OrderedDict +from collections.abc import Callable from datetime import timedelta from types import CodeType, FunctionType -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any import onnx import torch @@ -80,7 +81,7 @@ def source_rank(self) -> int: def _source_rank(self) -> int: return 0 - def result(self) -> List[torch.Tensor]: + def result(self) -> list[torch.Tensor]: return [] def synchronize(self): @@ -177,7 +178,7 @@ def configure_ort_compatible_zero_stage3(debug=False, stats_output_dir=None, sta @nvtx_function_decorator -def _get_params_for_current_module(module: torch.nn.Module) -> List[torch.nn.parameter.Parameter]: +def _get_params_for_current_module(module: torch.nn.Module) -> list[torch.nn.parameter.Parameter]: """Retrieve the parameters for this module. Logic adapted from @@ -192,7 +193,7 @@ def _get_params_for_current_module(module: torch.nn.Module) -> List[torch.nn.par @nvtx_function_decorator -def _get_all_zero_stage3_params(module: torch.nn.Module) -> Dict[str, torch.nn.parameter.Parameter]: +def _get_all_zero_stage3_params(module: torch.nn.Module) -> dict[str, torch.nn.parameter.Parameter]: """Retrieve all the parameters that are offloaded.""" from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus @@ -205,7 +206,7 @@ def _get_all_zero_stage3_params(module: torch.nn.Module) -> Dict[str, torch.nn.p # Used to cache the map avoid repeated loop up (X us) overhead during training. -_ModuleToParametersRefs: Dict[torch.nn.Module, List[torch.nn.parameter.Parameter]] = OrderedDict() +_ModuleToParametersRefs: dict[torch.nn.Module, list[torch.nn.parameter.Parameter]] = OrderedDict() class ORTZeROOffloadPreForwardFunction(torch.autograd.Function): @@ -295,7 +296,7 @@ def backward(ctx, *grads): # completing the full backward propagation, will not affect parameter updates. 
passed_in_param_grad = [ torch.zeros(shape, dtype=dtype, device=device) - for shape, dtype, device in zip(ctx.shapes, ctx.dtypes, ctx.devices) + for shape, dtype, device in zip(ctx.shapes, ctx.dtypes, ctx.devices, strict=False) ] zero_grads = updated_grads[:input_count] + tuple(passed_in_param_grad) @@ -306,9 +307,9 @@ def backward(ctx, *grads): @staticmethod def infer_shape( node: onnx.NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: input_pointer_scalars_attr_name = "input_pointer_scalars" found = [attr for attr in node.attribute if attr.name == input_pointer_scalars_attr_name] assert len(found) == 1 @@ -414,9 +415,9 @@ def backward(ctx, *grads): @staticmethod def infer_shape( node: onnx.NodeProto, - tensor_input_shapes: List[Optional[List[Union[int, str]]]], - tensor_input_dtypes: List[torch.onnx.TensorProtoDataType], - ) -> Tuple[List[Optional[List[Union[int, str]]]], List[torch.onnx.TensorProtoDataType]]: + tensor_input_shapes: list[list[int | str] | None], + tensor_input_dtypes: list[torch.onnx.TensorProtoDataType], + ) -> tuple[list[list[int | str] | None], list[torch.onnx.TensorProtoDataType]]: return tensor_input_shapes, tensor_input_dtypes @staticmethod @@ -480,7 +481,7 @@ def pre_forward_module_apply_impl( module: torch.nn.Module, args: ORTModelInputOutputType, kwargs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: """This function is a dispatcher to call DeepSpeed stage3 pre forward hooks in sequence. All hook functions can be retrieved from the function store, due to exporter only supports a list of tensors as @@ -556,7 +557,7 @@ def post_forward_module_apply_impl( module: torch.nn.Module, args: ORTModelInputOutputType, outputs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: """This function is a dispatcher to call DeepSpeed stage3 post forward hooks in sequence. 
All hook functions can be retrieved from function store, due to exporter only supports a list of tensors as @@ -615,7 +616,7 @@ def post_forward_outmost_module_apply_impl( module: torch.nn.Module, args: ORTModelInputOutputType, outputs: ORTModelInputOutputType, - ) -> Tuple[ORTModelInputOutputType, ORTModelInputOutputType]: + ) -> tuple[ORTModelInputOutputType, ORTModelInputOutputType]: outputs_tensors, outputs_schema = extract_data_and_schema(outputs) _end_of_forward_hook = self._functions.get("_end_of_forward_hook") @@ -636,7 +637,7 @@ def post_forward_outmost_module_apply_impl( return args, updated_outputs @nvtx_function_decorator - def _check_all_tensor(self, tensor_list: Tuple[torch.Tensor], module: torch.nn.Module, name: str): + def _check_all_tensor(self, tensor_list: tuple[torch.Tensor], module: torch.nn.Module, name: str): if not self._enable_debug_info: return diff --git a/orttraining/orttraining/python/training/utils/ptable.py b/orttraining/orttraining/python/training/utils/ptable.py index 5e06864800666..c3e022f252e13 100644 --- a/orttraining/orttraining/python/training/utils/ptable.py +++ b/orttraining/orttraining/python/training/utils/ptable.py @@ -3,14 +3,12 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from typing import List - class Row: """A row in a PTable""" - def __init__(self, columns: List[str]) -> None: - self._columns: List[str] = columns # List of strings + def __init__(self, columns: list[str]) -> None: + self._columns: list[str] = columns # List of strings self._annotation_table = None # Optional PTable used for displaying detailed information about the feature row. def append_annotation_table(self, ptable) -> None: @@ -21,11 +19,11 @@ class PTable: """A table that can be printed to the console.""" def __init__(self, sortable=False) -> None: - self._rows: List[Row] = [] + self._rows: list[Row] = [] self._column_count = None self._sortable = sortable # allow the rows to be sorted by the first column - def add_row(self, columns: List[str]) -> Row: + def add_row(self, columns: list[str]) -> Row: """Add a row to the table. The number of columns must match the number of columns in the table.""" if self._column_count is None: self._column_count = len(columns) diff --git a/orttraining/orttraining/python/training/utils/torch_io_helper.py b/orttraining/orttraining/python/training/utils/torch_io_helper.py index a6aa390a3ed35..f0cf09d91b81e 100644 --- a/orttraining/orttraining/python/training/utils/torch_io_helper.py +++ b/orttraining/orttraining/python/training/utils/torch_io_helper.py @@ -6,7 +6,7 @@ import copy import warnings from collections import OrderedDict, abc -from typing import List, Mapping, Optional, Sequence, Tuple, Union +from collections.abc import Mapping, Sequence import torch @@ -37,16 +37,16 @@ def get_primitive_dtype(value): # Data types supported as model inputs and outputs. 
-ORTModelInputOutputType = Union[ - None, - str, - int, - bool, - float, - torch.Tensor, - Sequence["ORTModelInputOutputType"], - Mapping[str, "ORTModelInputOutputType"], -] +ORTModelInputOutputType = ( + str + | int + | bool + | float + | torch.Tensor + | Sequence["ORTModelInputOutputType"] + | Mapping[str, "ORTModelInputOutputType"] + | None +) class _TensorStub: @@ -57,16 +57,16 @@ class _TensorStub: def __init__( self, tensor_idx: int, - name: Optional[str] = None, - dtype: Optional[str] = None, + name: str | None = None, + dtype: str | None = None, shape=None, - shape_dims: Optional[int] = None, + shape_dims: int | None = None, ): self.tensor_idx = tensor_idx - self.name: Optional[str] = name - self.dtype: Optional[str] = dtype + self.name: str | None = name + self.dtype: str | None = dtype self.shape = shape - self.shape_dims: Optional[int] = shape_dims # r.g. rank. + self.shape_dims: int | None = shape_dims # e.g. rank. def __repr__(self) -> str: result = "_TensorStub(" @@ -108,13 +108,9 @@ def __eq__(self, other): # Data schema used to represent model's input or output. -ORTModelInputOutputSchemaType = Union[ - None, - str, - _TensorStub, - Sequence["ORTModelInputOutputSchemaType"], - Mapping[str, "ORTModelInputOutputSchemaType"], -] +ORTModelInputOutputSchemaType = ( + str | _TensorStub | Sequence["ORTModelInputOutputSchemaType"] | Mapping[str, "ORTModelInputOutputSchemaType"] | None +) def _warn_of_constant_inputs(data): @@ -126,8 +122,8 @@ def _warn_of_constant_inputs(data): @nvtx_function_decorator def extract_data_and_schema( - data: ORTModelInputOutputType, constant_as_tensor=False, device: Optional[torch.device] = None -) -> Tuple[List[torch.Tensor], ORTModelInputOutputSchemaType]: + data: ORTModelInputOutputType, constant_as_tensor=False, device: torch.device | None = None +) -> tuple[list[torch.Tensor], ORTModelInputOutputSchemaType]: """Extract the data schema by replacing every torch.Tensor value with _TensorStub, and return all tensors in a list. @@ -235,7 +231,7 @@ def _flatten_from_data(data: ORTModelInputOutputType, prefix_name: str = ""): @nvtx_function_decorator def unflatten_data_using_schema( - data: List[torch.Tensor], schema: ORTModelInputOutputSchemaType + data: list[torch.Tensor], schema: ORTModelInputOutputSchemaType ) -> ORTModelInputOutputType: """Follows the schema to generate an output that is expected by the user. @@ -280,7 +276,7 @@ def unflatten_data_using_schema( """ - def _replace_stub_with_tensor_value(data_schema: ORTModelInputOutputSchemaType, data: List[torch.Tensor]): + def _replace_stub_with_tensor_value(data_schema: ORTModelInputOutputSchemaType, data: list[torch.Tensor]): # Recursively traverse across user_output and replace all _TensorStub # with torch.Tensor values from outputs following output_idx diff --git a/orttraining/orttraining/python/training/utils/torch_type_map.py b/orttraining/orttraining/python/training/utils/torch_type_map.py index 2b429f3fd4f3a..49c3b32fc5037 100644 --- a/orttraining/orttraining/python/training/utils/torch_type_map.py +++ b/orttraining/orttraining/python/training/utils/torch_type_map.py @@ -4,8 +4,6 @@ # -------------------------------------------------------------------------- -from typing import Union - import torch # Mapping from pytorch scalar type to onnx scalar type.
@@ -36,7 +34,7 @@ _ONNX_TO_DTYPE = {onnx_dtype: torch_dtype for torch_dtype, onnx_dtype in _DTYPE_TO_ONNX.items()} -def pytorch_type_to_onnx_dtype(dtype_or_scalar_type: Union[torch.dtype, str]) -> torch.onnx.TensorProtoDataType: +def pytorch_type_to_onnx_dtype(dtype_or_scalar_type: torch.dtype | str) -> torch.onnx.TensorProtoDataType: """Converts a pytorch dtype or scalar type string to an onnx dtype. PyTorch type can be either a dtype or a scalar type string. """ diff --git a/orttraining/orttraining/test/python/_test_helpers.py b/orttraining/orttraining/test/python/_test_helpers.py index 3d75b3f98862e..1dd304549869d 100644 --- a/orttraining/orttraining/test/python/_test_helpers.py +++ b/orttraining/orttraining/test/python/_test_helpers.py @@ -95,7 +95,7 @@ def assert_gradients_match_and_reset_gradient( pt_named_params = list(pt_model.named_parameters()) assert len(ort_named_params) == len(pt_named_params) - for ort_named_param, pt_named_param in zip(ort_named_params, pt_named_params): + for ort_named_param, pt_named_param in zip(ort_named_params, pt_named_params, strict=False): ort_name, ort_param = ort_named_param pt_name, pt_param = pt_named_param @@ -180,7 +180,7 @@ def run_with_ort_on_device(device, model, input_list, label_input, is_eval_mode= def compare_tensor_list(val_list_a, val_list_b): - for val_a, val_b in zip(val_list_a, val_list_b): + for val_a, val_b in zip(val_list_a, val_list_b, strict=False): assert_values_are_close(val_a, val_b, atol=1e-7, rtol=1e-6) diff --git a/orttraining/orttraining/test/python/orttraining_test_dort.py b/orttraining/orttraining/test/python/orttraining_test_dort.py index bd36ebf545be6..759af0854145f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_dort.py +++ b/orttraining/orttraining/test/python/orttraining_test_dort.py @@ -162,7 +162,7 @@ def run(fun, seed: torch.Tensor): # ORT result. tensors = run(optimized_elementwise_model, seed) - for tensor, baseline_tensor in zip(tensors, baseline_tensors): + for tensor, baseline_tensor in zip(tensors, baseline_tensors, strict=False): torch.testing.assert_close(tensor, baseline_tensor) assert len(cached.keys()) == 2, ( @@ -182,7 +182,7 @@ def run(fun, seed: torch.Tensor): # ORT result. 
tensors = run(optimized_elementwise_model, seed) - for tensor, baseline_tensor in zip(tensors, baseline_tensors): + for tensor, baseline_tensor in zip(tensors, baseline_tensors, strict=False): torch.testing.assert_close(tensor, baseline_tensor) # 4 GraphModule's respectively for @@ -369,7 +369,7 @@ def run(model, tensor_x, tensor_y): print(f"MNIST loss: {loss} (pytorch), {loss_new} (ort).") torch.testing.assert_close(loss, loss_new, rtol=1e-2, atol=1e-5) - for grad, grad_new in zip(grads, grads_new): + for grad, grad_new in zip(grads, grads_new, strict=False): torch.testing.assert_close(grad, grad_new) # Run 5 times because ORT runs have side effects and we want to make sure diff --git a/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py b/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py index dd26448f0c596..07a9ab3a1d1cf 100644 --- a/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py +++ b/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py @@ -92,7 +92,7 @@ def test_save(self): ort_outs = ort_session.run(None, ort_inputs) onnx_output_names = [node.name for node in onnx_model.graph.output] - onnx_name_to_output = dict(zip(onnx_output_names, ort_outs)) + onnx_name_to_output = dict(zip(onnx_output_names, ort_outs, strict=False)) ort_output = onnx_name_to_output["output"] np.testing.assert_allclose(to_numpy(torch_out), ort_output, rtol=1e-03, atol=1e-05) diff --git a/orttraining/orttraining/test/python/orttraining_test_gru.py b/orttraining/orttraining/test/python/orttraining_test_gru.py index fcb7e13b1694f..0693b2ada447b 100644 --- a/orttraining/orttraining/test/python/orttraining_test_gru.py +++ b/orttraining/orttraining/test/python/orttraining_test_gru.py @@ -666,7 +666,7 @@ def test_gru_forward(sequence_length, batch_size, input_size, hidden_size, linea outs_ort = gru.forward_ort(inputs, weights, recurrence_weights, bias, initial_hidden_state) outs_np = gru.forward_np(inputs, weights, recurrence_weights, bias, initial_hidden_state) - for ort_out, np_out in zip(outs_ort, outs_np): + for ort_out, np_out in zip(outs_ort, outs_np, strict=False): assert np.allclose(ort_out, np_out, rtol=1e-03, atol=1e-05) @@ -716,5 +716,5 @@ def test_gru_backward(sequence_length, batch_size, input_size, hidden_size, line grad_final_hidden_state, ) - for ort_out, np_out in zip(outs_ort, outs_np): + for ort_out, np_out in zip(outs_ort, outs_np, strict=False): assert np.allclose(ort_out, np_out, rtol=1e-01, atol=1e-03) diff --git a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py index 655c9def2c66c..ff1c4dc8aad13 100644 --- a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py @@ -200,7 +200,7 @@ def call_backward(y): def call_allclose(y, y_ref): assert type(y) is type(y_ref) if isinstance(y, Iterable): - for ele, ele_ref in zip(y, y_ref): + for ele, ele_ref in zip(y, y_ref, strict=False): torch.allclose(ele, ele_ref) else: torch.allclose(y, y_ref) diff --git a/orttraining/orttraining/test/python/orttraining_test_lort.py b/orttraining/orttraining/test/python/orttraining_test_lort.py index ccd06e1a3ab62..3aca181edcfc2 100644 --- a/orttraining/orttraining/test/python/orttraining_test_lort.py +++ b/orttraining/orttraining/test/python/orttraining_test_lort.py @@ 
-101,7 +101,7 @@ def run(model, device, x, y): print(f"MNIST loss: {loss} (pytorch), {loss_new} (ort).") torch.testing.assert_close(loss.to("lazy"), loss_new, rtol=1e-2, atol=1e-5) - for g, g_new in zip(grads, grads_new): + for g, g_new in zip(grads, grads_new, strict=False): torch.testing.assert_close(g.to("lazy"), g_new) for _ in range(5): diff --git a/orttraining/orttraining/test/python/orttraining_test_lstm.py b/orttraining/orttraining/test/python/orttraining_test_lstm.py index 1d75f12801fba..57fb6c4d1985b 100644 --- a/orttraining/orttraining/test/python/orttraining_test_lstm.py +++ b/orttraining/orttraining/test/python/orttraining_test_lstm.py @@ -867,7 +867,7 @@ def test_lstm_forward(sequence_length, batch_size, input_size, hidden_size): inputs, weights, recurrence_weights, bias, initial_hidden_state, initial_cell_state, peephole_weights ) - for ort_out, np_out in zip(outs_ort, outs_np): + for ort_out, np_out in zip(outs_ort, outs_np, strict=False): assert np.allclose(ort_out, np_out, rtol=1e-03, atol=1e-05) @@ -933,5 +933,5 @@ def test_lstm_backward(sequence_length, batch_size, input_size, hidden_size): grad_final_cell_state, ) - for ort_out, np_out in zip(outs_ort, outs_np): + for ort_out, np_out in zip(outs_ort, outs_np, strict=False): assert np.allclose(ort_out, np_out, rtol=1e-03, atol=1e-05) diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py b/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py index 275d53daec889..d8f2ae2a5bcee 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py +++ b/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py @@ -605,7 +605,7 @@ def test_retrieve_parameters(): # Then assert not non_trainable_params - for ort_param, (pt_param_name, pt_param) in zip(trainable_params, pt_model.named_parameters()): + for ort_param, (pt_param_name, pt_param) in zip(trainable_params, pt_model.named_parameters(), strict=False): assert ort_param.name == pt_param_name assert np.allclose( np.frombuffer(ort_param.raw_data, dtype=np.float32).reshape(pt_param.shape), @@ -853,7 +853,7 @@ def mse_loss(prediction, target): ort_outs = ort_session.run([ort_output_names], ort_inputs) # assert all the gradients are close - for ort_grad, pt_param in zip(ort_outs[0], pt_model.parameters()): + for ort_grad, pt_param in zip(ort_outs[0], pt_model.parameters(), strict=False): assert np.allclose(ort_grad, _to_numpy(pt_param.grad)) diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_pipeline_module.py b/orttraining/orttraining/test/python/orttraining_test_ort_pipeline_module.py index d59e32cde33dd..8047e4217c6f9 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ort_pipeline_module.py +++ b/orttraining/orttraining/test/python/orttraining_test_ort_pipeline_module.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, Tuple import deepspeed import torch @@ -39,14 +38,14 @@ def __init__(self, x: torch.Tensor, y: torch.Tensor): def __len__(self) -> int: return self.x.size(0) - def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: + def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]: return self.x[idx], self.y[idx] class SimpleNetPipeInput(nn.Module): """First stage of the pipeline, responsible for initial processing.""" - def __init__(self, config: Dict[str, int]): + def __init__(self, config: dict[str, int]): super().__init__() self.linear = nn.Linear(config["input_size"], config["hidden_size"]) 
self.activation = nn.ReLU() @@ -60,7 +59,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class SimpleNetPipeBlock(nn.Module): """Intermediate stage of the pipeline, can be duplicated to deepen the network.""" - def __init__(self, config: Dict[str, int]): + def __init__(self, config: dict[str, int]): super().__init__() self.linear = nn.Linear(config["hidden_size"], config["hidden_size"]) self.activation = nn.ReLU() @@ -74,7 +73,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class SimpleNetPipeOutput(nn.Module): """Final stage of the pipeline, producing the output.""" - def __init__(self, config: Dict[str, int]): + def __init__(self, config: dict[str, int]): super().__init__() self.linear = nn.Linear(config["hidden_size"], config["output_size"]) @@ -83,7 +82,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x -def build_model(config: Dict[str, int], n: int, layer_spec: bool) -> nn.Module: +def build_model(config: dict[str, int], n: int, layer_spec: bool) -> nn.Module: """Constructs and returns the model either using LayerSpec or nn.Sequential.""" if layer_spec: print("Wrapping layers with LayerSpec") diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 912af9bc88755..7eaa7d1d9cb5d 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -4166,7 +4166,7 @@ def forward( out_ort = ort_model(*y) assert len(out_pt) == len(out_ort) - for x, y in zip(out_pt, out_ort): + for x, y in zip(out_pt, out_ort, strict=False): _test_helpers.assert_values_are_close(x, y) @@ -4257,7 +4257,7 @@ def test_hf_save_pretrained(): ).to(device) model2 = ORTModule(model2) - for p1, p2 in zip(model1.parameters(), model2.parameters()): + for p1, p2 in zip(model1.parameters(), model2.parameters(), strict=False): assert p1.data.ne(p2.data).sum() == 0 @@ -5123,7 +5123,7 @@ def run_optim_step(optimizer): pt_loss = run_step(pt_model, x1) ort_loss = run_step(ort_model, x2) - for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters()): + for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters(), strict=False): ort_param.grad = copy.deepcopy(pt_param.grad) _test_helpers.assert_values_are_close(pt_loss, ort_loss) @@ -5133,7 +5133,7 @@ def run_optim_step(optimizer): run_optim_step(transformers_adamw_optimizer) run_optim_step(ort_fused_adam_optimizer) - for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters()): + for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters(), strict=False): _test_helpers.assert_values_are_close(pt_param, ort_param, atol=1e-4, rtol=1e-5) @@ -5173,7 +5173,7 @@ def run_optim_step(optimizer): pt_loss = run_step(pt_model, x1) ort_loss = run_step(ort_model, x2) - for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters()): + for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters(), strict=False): ort_param.grad = copy.deepcopy(pt_param.grad) _test_helpers.assert_values_are_close(pt_loss, ort_loss, atol=1e-4, rtol=1e-5) @@ -5185,7 +5185,7 @@ def run_optim_step(optimizer): run_optim_step(adamw_optimizer) run_optim_step(ort_fused_adam_optimizer) - for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters()): + for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters(), strict=False): 
_test_helpers.assert_values_are_close(pt_param, ort_param, atol=1e-4, rtol=1e-5) @@ -5506,7 +5506,7 @@ def random_state_equal(a, b): assert type(a) is type(b) if isinstance(a, tuple): assert len(a) == len(b) - return all([random_state_equal(a_i, b_i) for a_i, b_i in zip(a, b)]) + return all([random_state_equal(a_i, b_i) for a_i, b_i in zip(a, b, strict=False)]) if isinstance(a, np.ndarray): return np.array_equal(a, b) if isinstance(a, torch.Tensor): @@ -6170,7 +6170,7 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): run_optim_step(pt_optimizer) run_optim_step(ort_optimizer) - for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters()): + for pt_param, ort_param in zip(pt_model.parameters(), ort_model.parameters(), strict=False): _test_helpers.assert_values_are_close(pt_param.grad, ort_param.grad, atol=1e-4, rtol=1e-5) if os.getenv("ORTMODULE_ROCM_TEST", "0") == "1": @@ -6394,7 +6394,7 @@ def run_step(model, x): pt_grads = run_step(pt_model, pt_x) ort_grads = run_step(ort_model, ort_x) - for pt_grad, ort_grad in zip(pt_grads, ort_grads): + for pt_grad, ort_grad in zip(pt_grads, ort_grads, strict=False): if use_fp16: assert torch.allclose(pt_grad, ort_grad, atol=1e-3, rtol=1e-3) else: @@ -6443,7 +6443,7 @@ def run_step(model, x): pt_grads = run_step(pt_model, pt_x) ort_grads = run_step(ort_model, ort_x) - for pt_grad, ort_grad in zip(pt_grads, ort_grads): + for pt_grad, ort_grad in zip(pt_grads, ort_grads, strict=False): assert torch.allclose(pt_grad, ort_grad) if conv_algo_search is not None: @@ -6489,7 +6489,7 @@ def run_step(model, x): pt_grads = run_step(pt_model, pt_x) ort_grads = run_step(ort_model, ort_x) - for pt_grad, ort_grad in zip(pt_grads, ort_grads): + for pt_grad, ort_grad in zip(pt_grads, ort_grads, strict=False): assert torch.allclose(pt_grad, ort_grad, atol=1e-2, rtol=1e-2) if conv_algo_search is not None: @@ -6917,7 +6917,7 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): ort_model2 = ORTModule(copy.deepcopy(pt_model), DebugOptions(save_onnx=True, onnx_prefix="recompute")) ort_prediction2 = run_step(ort_model2, ort_input, ort_mask, ort_target) - for ort_param1, ort_param2 in zip(ort_model1.parameters(), ort_model2.parameters()): + for ort_param1, ort_param2 in zip(ort_model1.parameters(), ort_model2.parameters(), strict=False): _test_helpers.assert_values_are_close(ort_param1.grad, ort_param2.grad, atol=1e-4, rtol=1e-5) if os.getenv("ORTMODULE_ROCM_TEST", "0") == "1": diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py index 5764a6a81e5db..2e1c90bcac5cd 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py @@ -7,7 +7,6 @@ import copy import os -from typing import Tuple import onnx import pytest @@ -264,13 +263,13 @@ def forward( ctx, input, alpha: float, - beta: Tuple[float, float], + beta: tuple[float, float], gamma: float, delta: bool, - epsilon: Tuple[bool, bool], + epsilon: tuple[bool, bool], zeta: int, - eta: Tuple[int, int], - theta: Tuple[float, float], + eta: tuple[int, int], + theta: tuple[float, float], ): ctx.save_for_backward(input) ctx.alpha = alpha @@ -296,7 +295,7 @@ def backward(ctx, grad_output): assert alpha == alpha_value assert isinstance(alpha, float) - assert all(a == b for a, b in zip(beta, beta_value)) + assert all(a == b for a, b in zip(beta, beta_value, strict=False)) assert 
all(isinstance(x, float) for x in beta) assert gamma == gamma_value @@ -305,16 +304,16 @@ def backward(ctx, grad_output): assert ctx.delta == delta_value assert isinstance(ctx.delta, bool) - assert all(a == b for a, b in zip(ctx.epsilon, epsilon_value)) + assert all(a == b for a, b in zip(ctx.epsilon, epsilon_value, strict=False)) assert all(isinstance(x, bool) for x in ctx.epsilon) assert ctx.zeta == zeta_value assert isinstance(ctx.zeta, int) - assert all(a == b for a, b in zip(ctx.eta, eta_value)) + assert all(a == b for a, b in zip(ctx.eta, eta_value, strict=False)) assert all(isinstance(x, int) for x in ctx.eta) - assert all(a == b for a, b in zip(ctx.theta, theta_value)) + assert all(a == b for a, b in zip(ctx.theta, theta_value, strict=False)) assert all(isinstance(x, float) for x in ctx.theta) return alpha * beta[0] * beta[1] * gamma * grad_input, None, None, None, None, None, None, None, None @@ -1651,7 +1650,7 @@ def _compare_shape(shape1, shape2): if len(shape1.dim) != len(shape2.dim): return False - for dim1, dim2 in zip(shape1.dim, shape2.dim): + for dim1, dim2 in zip(shape1.dim, shape2.dim, strict=False): if dim1.HasField("dim_value") and dim1.HasField("dim_value") and dim1.dim_value == dim2.dim_value: continue diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py index 07d581b576c45..54e414b36c2ba 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py @@ -30,7 +30,7 @@ def assert_gradients_match_and_reset_gradient( pt_named_params = list(pt_model.named_parameters()) self.assertEqual(len(ort_named_params), len(pt_named_params)) - for ort_named_param, pt_named_param in zip(ort_named_params, pt_named_params): + for ort_named_param, pt_named_param in zip(ort_named_params, pt_named_params, strict=False): ort_name, ort_param = ort_named_param pt_name, pt_param = pt_named_param diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py index 0c381d70ca4c1..85b7180d97ff3 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py @@ -206,7 +206,7 @@ def _run_op_test(op_type, onnx_dtype, create_model_func, gen_inputs_func, **kwar if isinstance(pt_outputs, tuple): assert isinstance(ort_outputs, tuple) assert len(pt_outputs) == len(ort_outputs) - for pt_output, ort_output in zip(pt_outputs, ort_outputs): + for pt_output, ort_output in zip(pt_outputs, ort_outputs, strict=False): _test_helpers.assert_values_are_close(pt_output, _from_dlpack(ort_output), rtol=rtol, atol=atol) else: _test_helpers.assert_values_are_close(pt_outputs, _from_dlpack(ort_outputs), rtol=rtol, atol=atol) @@ -489,7 +489,7 @@ def test_dropout_op(onnx_dtype, input_shape_and_ratio): def _check_output(x, y, mask, ratio): all_count = 0 masked_count = 0 - for x_value, y_value, mask_value in zip(x, y, mask): + for x_value, y_value, mask_value in zip(x, y, mask, strict=False): if mask_value: assert abs(y_value - x_value / (1.0 - ratio)) < 0.05 else: diff --git a/orttraining/orttraining/test/python/orttraining_test_ortvalue.py b/orttraining/orttraining/test/python/orttraining_test_ortvalue.py index 317efa0061865..327be44ed88c2 100644 --- 
a/orttraining/orttraining/test/python/orttraining_test_ortvalue.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortvalue.py @@ -104,7 +104,7 @@ def testOrtValueVector_float32(self): vect.push_back(ortvalue._ortvalue) self.assertEqual(len(vect.bool_tensor_indices()), 0) self.assertEqual(len(vect), 2) - for i, (ov, ar) in enumerate(zip(vect, narrays)): + for i, (ov, ar) in enumerate(zip(vect, narrays, strict=False)): ovar = ov.numpy() assert_almost_equal(ar, ovar) self.assertEqual(ov.element_type(), vect.element_type_at(i)) @@ -120,7 +120,7 @@ def testOrtValueVector_bool(self): vect.push_back(ortvalue._ortvalue) self.assertEqual(vect.bool_tensor_indices(), [0, 1]) self.assertEqual(len(vect), 2) - for ov, ar in zip(vect, narrays): + for ov, ar in zip(vect, narrays, strict=False): ovar = ov.numpy() assert_almost_equal(ar, ovar) @@ -152,7 +152,7 @@ def OrtValueVectorDlPackOrtValue(self, my_to_tensor, tensor_type, device, dtype= self.assertEqual(cf, cf2) # it should be [3, 3] ptr2 = [] - for av1, v2 in zip(narrays, converted_values): + for av1, v2 in zip(narrays, converted_values, strict=False): ptr2.append(v2.data_ptr()) if hasattr(v2, "cpu"): av2 = v2.cpu().numpy() diff --git a/orttraining/orttraining/test/python/orttraining_test_sampler.py b/orttraining/orttraining/test/python/orttraining_test_sampler.py index 68f9ac5052134..0a6b54d972a46 100644 --- a/orttraining/orttraining/test/python/orttraining_test_sampler.py +++ b/orttraining/orttraining/test/python/orttraining_test_sampler.py @@ -54,7 +54,7 @@ def test_load_balancing_data_sampler_shuffles_and_balances_load(): random.shuffle(complexities) samples = [torch.FloatTensor([val]) for val in range(100)] - samples_and_complexities = list(zip(samples, complexities)) + samples_and_complexities = list(zip(samples, complexities, strict=False)) dataset = MyDataset(samples_and_complexities) def complexity_fn(sample): @@ -67,7 +67,7 @@ def complexity_fn(sample): dataset, complexity_fn=complexity_fn, world_size=2, rank=1, shuffle=True ) - for index0, index1 in zip(data_sampler0, data_sampler1): + for index0, index1 in zip(data_sampler0, data_sampler1, strict=False): assert samples_and_complexities[index0][1] == samples_and_complexities[index1][1] @@ -90,7 +90,7 @@ def complexity_fn(sample): dataset, complexity_fn=complexity_fn, world_size=1, rank=0, shuffle=False, group_size=8 ) - for index, sorted_sample in zip(data_sampler, samples_and_complexities_sorted): + for index, sorted_sample in zip(data_sampler, samples_and_complexities_sorted, strict=False): assert samples_and_complexities[index][1] == sorted_sample[1] @@ -127,7 +127,9 @@ def complexity_fn(sample): dataset, complexity_fn=complexity_fn, world_size=1, rank=0, shuffle=True, group_size=8 ) - for index, sorted_and_shuffled_sample in zip(data_sampler, samples_and_complexities_sorted_and_shuffled): + for index, sorted_and_shuffled_sample in zip( + data_sampler, samples_and_complexities_sorted_and_shuffled, strict=False + ): assert samples_and_complexities[index][1] == sorted_and_shuffled_sample[1] diff --git a/orttraining/orttraining/test/python/orttraining_test_utilities.py b/orttraining/orttraining/test/python/orttraining_test_utilities.py index faa04f327be7f..c3fc9c2d2577a 100644 --- a/orttraining/orttraining/test/python/orttraining_test_utilities.py +++ b/orttraining/orttraining/test/python/orttraining_test_utilities.py @@ -256,7 +256,12 @@ def _recursive_compare(real, expected): if flag == 0: out, schema = extract_data_and_schema(raw_data) - assert all([torch.allclose(o, d) if 
isinstance(o, torch.Tensor) else o == d for o, d in zip(out, flatten_data)]) + assert all( + [ + torch.allclose(o, d) if isinstance(o, torch.Tensor) else o == d + for o, d in zip(out, flatten_data, strict=False) + ] + ) if not isinstance(raw_data, torch.Tensor): assert type(schema) is type(raw_data) @@ -276,7 +281,7 @@ def _recursive_compare(real, expected): assert all( [ torch.allclose(o, d) if isinstance(o, torch.Tensor) else o == d - for o, d in zip(out, flatten_data_constant_as_tensor) + for o, d in zip(out, flatten_data_constant_as_tensor, strict=False) ] ) diff --git a/orttraining/tools/ci_test/compare_results.py b/orttraining/tools/ci_test/compare_results.py index 0ab0a1246a421..2d4a3d31dec41 100644 --- a/orttraining/tools/ci_test/compare_results.py +++ b/orttraining/tools/ci_test/compare_results.py @@ -43,7 +43,7 @@ def _compare_results(expected_results, actual_results, field_comparisons): return False mismatch_detected = False - for row_idx, (expected_row, actual_row) in enumerate(zip(expected_results, actual_results)): + for row_idx, (expected_row, actual_row) in enumerate(zip(expected_results, actual_results, strict=False)): for field_name, comparison in field_comparisons.items(): actual, expected = actual_row[field_name], expected_row[field_name] if not comparison.fn(actual, expected): diff --git a/orttraining/tools/scripts/nv_run_pretraining.py b/orttraining/tools/scripts/nv_run_pretraining.py index 8f399263e1e65..565f5af84d4fa 100644 --- a/orttraining/tools/scripts/nv_run_pretraining.py +++ b/orttraining/tools/scripts/nv_run_pretraining.py @@ -336,7 +336,7 @@ def prepare_model_and_optimizer(args, device): optimizer._lazy_init_maybe_master_weights() optimizer._amp_stash.lazy_init_called = True optimizer.load_state_dict(checkpoint["optimizer"]) - for param, saved_param in zip(amp.master_params(optimizer), checkpoint["master params"]): + for param, saved_param in zip(amp.master_params(optimizer), checkpoint["master params"], strict=False): param.data.copy_(saved_param.data) if args.local_rank != -1: diff --git a/pyproject.toml b/pyproject.toml index 60fe630b1378b..c30201e1b2745 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ reportMissingImports = false [tool.ruff] # NOTE: Do not create an exclude list. Edit .lintrunner.toml instead -target-version = "py38" +target-version = "py310" line-length = 120 [tool.ruff.lint] @@ -55,6 +55,7 @@ ignore = [ "SIM108", # We don't encourage ternary operators "SIM114", # Don't combine if branches for debugability "SIM116", # Don't use dict lookup to replace if-else + "UP038", # Using X | Y in isinstance checks is a little aggressive ] ignore-init-module-imports = true diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 87180a242e370..865d1a0c58323 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates # Licensed under the MIT License.
+from __future__ import annotations import argparse import contextlib @@ -1387,7 +1388,7 @@ def generate_build_tree( if not all(needed_args): raise BuildError( "iOS/MacOS framework build on MacOS canceled due to missing arguments: " - + ", ".join(val for val, cond in zip(arg_names, needed_args) if not cond) + + ", ".join(val for val, cond in zip(arg_names, needed_args, strict=False) if not cond) ) # note: this value is mainly used in framework_info.json file to specify the build osx type platform_name = "macabi" if args.macos == "Catalyst" else args.apple_sysroot diff --git a/tools/ci_build/github/apple/package_assembly_utils.py b/tools/ci_build/github/apple/package_assembly_utils.py index c6822466d73d0..829bca8c743df 100644 --- a/tools/ci_build/github/apple/package_assembly_utils.py +++ b/tools/ci_build/github/apple/package_assembly_utils.py @@ -7,7 +7,6 @@ import pathlib import re import shutil -from typing import Dict, List _script_dir = pathlib.Path(__file__).parent.resolve(strict=True) repo_root = _script_dir.parents[3] @@ -30,7 +29,7 @@ def all_variant_names(cls): def gen_file_from_template( - template_file: pathlib.Path, output_file: pathlib.Path, variable_substitutions: Dict[str, str], strict: bool = True + template_file: pathlib.Path, output_file: pathlib.Path, variable_substitutions: dict[str, str], strict: bool = True ): """ Generates a file from a template file. @@ -69,7 +68,7 @@ def replace_template_variable(match): output.write(content) -def filter_files(all_file_patterns: List[str], excluded_file_patterns: List[str]): +def filter_files(all_file_patterns: list[str], excluded_file_patterns: list[str]): """ Filters file paths based on inclusion and exclusion patterns @@ -90,7 +89,7 @@ def filter_files(all_file_patterns: List[str], excluded_file_patterns: List[str] return list(set(all_files) - set(exclude_files)) -def copy_repo_relative_to_dir(patterns: List[str], dest_dir: pathlib.Path): +def copy_repo_relative_to_dir(patterns: list[str], dest_dir: pathlib.Path): """ Copies file paths relative to the repo root to a directory. The given paths or path patterns are relative to the repo root, and the diff --git a/tools/ci_build/op_registration_utils.py b/tools/ci_build/op_registration_utils.py index 811ce424eae10..d404224a35eea 100644 --- a/tools/ci_build/op_registration_utils.py +++ b/tools/ci_build/op_registration_utils.py @@ -5,10 +5,11 @@ Utilities to help process files containing kernel registrations. """ +from __future__ import annotations + import os import pathlib import sys -import typing from logger import get_logger @@ -88,12 +89,12 @@ class RegistrationProcessor: def process_registration( self, - lines: typing.List[str], + lines: list[str], domain: str, operator: str, start_version: int, - end_version: typing.Optional[int] = None, - type: typing.Optional[str] = None, + end_version: int | None = None, + type: str | None = None, ): """ Process lines that contain a kernel registration. @@ -119,7 +120,7 @@ def ok(self): return False # return False as the derived class must override to report the real status -def _process_lines(lines: typing.List[str], offset: int, registration_processor: RegistrationProcessor): +def _process_lines(lines: list[str], offset: int, registration_processor: RegistrationProcessor): """ Process one or more lines that contain a kernel registration. 
Merge lines if split over multiple, and call registration_processor.process_registration with the original lines @@ -236,9 +237,7 @@ def _process_lines(lines: typing.List[str], offset: int, registration_processor: return offset + 1 -def process_kernel_registration_file( - filename: typing.Union[str, pathlib.Path], registration_processor: RegistrationProcessor -): +def process_kernel_registration_file(filename: str | pathlib.Path, registration_processor: RegistrationProcessor): """ Process a kernel registration file using registration_processor. :param filename: Path to file containing kernel registrations. diff --git a/tools/ci_build/op_registration_validator.py b/tools/ci_build/op_registration_validator.py index d92050a31f967..6cc7f3bb5ec6d 100644 --- a/tools/ci_build/op_registration_validator.py +++ b/tools/ci_build/op_registration_validator.py @@ -5,6 +5,8 @@ Validate ORT kernel registrations. """ +from __future__ import annotations + import argparse import dataclasses import itertools @@ -37,8 +39,8 @@ class RegistrationInfo: domain: str operator: str start_version: int - end_version: typing.Optional[int] - lines: typing.List[str] + end_version: int | None + lines: list[str] def domain_and_op_str(self): return f"{self.domain}:{self.operator}" @@ -50,16 +52,16 @@ def _log_registration_error(r: RegistrationInfo, message: str): class RegistrationValidator(op_registration_utils.RegistrationProcessor): def __init__(self): - self.all_registrations: typing.List[RegistrationInfo] = [] + self.all_registrations: list[RegistrationInfo] = [] def process_registration( self, - lines: typing.List[str], + lines: list[str], domain: str, operator: str, start_version: int, - end_version: typing.Optional[int] = None, - type: typing.Optional[str] = None, + end_version: int | None = None, + type: str | None = None, ): self.all_registrations.append( RegistrationInfo( @@ -114,7 +116,7 @@ def _validate_registrations_for_domain_and_op(self, registrations: typing.Iterat return num_invalid_registrations - def _validate_registration(self, r: RegistrationInfo, next_r: typing.Optional[RegistrationInfo]) -> bool: + def _validate_registration(self, r: RegistrationInfo, next_r: RegistrationInfo | None) -> bool: """ Validates a registration, `r`, with the next one in sorted order for a single domain and op, `next_r`, and returns whether it is valid. diff --git a/tools/ci_build/patch_manylinux.py b/tools/ci_build/patch_manylinux.py index 0d1cb37cc40ac..af03b594d9a69 100644 --- a/tools/ci_build/patch_manylinux.py +++ b/tools/ci_build/patch_manylinux.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import os diff --git a/tools/ci_build/reduce_op_kernels.py b/tools/ci_build/reduce_op_kernels.py index df6bbf7a4058e..f4f5cde3ddf7d 100755 --- a/tools/ci_build/reduce_op_kernels.py +++ b/tools/ci_build/reduce_op_kernels.py @@ -1,6 +1,7 @@ # !/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import io @@ -28,7 +29,7 @@ def _adapt_filters_for_extended_minimal_build( - base_required_ops: typing.Optional[dict], base_op_type_impl_filter: typing.Optional[OpTypeImplFilterInterface] + base_required_ops: dict | None, base_op_type_impl_filter: OpTypeImplFilterInterface | None ): """ Adapts the values returned by parse_config() for an extended minimal build or higher. 
@@ -77,7 +78,7 @@ class _AdaptedFilter(OpTypeImplFilterInterface): def __init__( self, filter_to_adapt: OpTypeImplFilterInterface, - required_domain_and_optypes: typing.Set[typing.Tuple[str, str]], + required_domain_and_optypes: set[tuple[str, str]], ): self.filter_to_adapt = filter_to_adapt self.required_domain_and_optypes = required_domain_and_optypes @@ -107,17 +108,15 @@ class _ExcludingRegistrationProcessor(op_registration_utils.RegistrationProcesso def __init__( self, - required_ops: typing.Optional[dict], - op_type_impl_filter: typing.Optional[OpTypeImplFilterInterface], + required_ops: dict | None, + op_type_impl_filter: OpTypeImplFilterInterface | None, output_file: io.TextIOWrapper, ): self._required_ops = required_ops self._op_type_impl_filter = op_type_impl_filter self._output_file = output_file - def _is_op_required( - self, domain: str, operator: str, start_version: int, end_version: typing.Optional[int] - ) -> bool: + def _is_op_required(self, domain: str, operator: str, start_version: int, end_version: int | None) -> bool: """See if an op is required.""" if self._required_ops is None: return True @@ -134,12 +133,12 @@ def _is_op_required( def process_registration( self, - lines: typing.List[str], + lines: list[str], constant_for_domain: str, operator: str, start_version: int, - end_version: typing.Optional[int] = None, - type: typing.Optional[str] = None, + end_version: int | None = None, + type: str | None = None, ): registration_identifier = "{}:{}({}){}".format( constant_for_domain, operator, start_version, f"<{type}>" if type else "" @@ -202,8 +201,8 @@ def _generate_provider_registrations( ort_root: Path, build_dir: Path, use_cuda: bool, - required_ops: typing.Optional[dict], - op_type_impl_filter: typing.Optional[OpTypeImplFilterInterface], + required_ops: dict | None, + op_type_impl_filter: OpTypeImplFilterInterface | None, ): """Generate provider registration files.""" kernel_registration_files = [ diff --git a/tools/ci_build/replace_urls_in_deps.py b/tools/ci_build/replace_urls_in_deps.py index 37dad358a6feb..2569b20fb44a5 100644 --- a/tools/ci_build/replace_urls_in_deps.py +++ b/tools/ci_build/replace_urls_in_deps.py @@ -4,6 +4,7 @@ # This file replaces https URLs in deps.txt to local file paths. It runs after we download the dependencies from Azure # DevOps Artifacts +from __future__ import annotations import argparse import csv diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index ae95d30936b83..78f59452d1284 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- # This script is used to add trigger rules to the workflow files. - +from __future__ import annotations import multiprocessing import os diff --git a/tools/ci_build/update_tsaoptions.py b/tools/ci_build/update_tsaoptions.py index 07be746aa1981..394a45cc4ee3b 100644 --- a/tools/ci_build/update_tsaoptions.py +++ b/tools/ci_build/update_tsaoptions.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+from __future__ import annotations import json import os diff --git a/tools/ci_build/upload_python_package_to_azure_storage.py b/tools/ci_build/upload_python_package_to_azure_storage.py index 16ff5d1f71611..c90ec1aa92b6b 100755 --- a/tools/ci_build/upload_python_package_to_azure_storage.py +++ b/tools/ci_build/upload_python_package_to_azure_storage.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import logging diff --git a/tools/python/find_optimizer_opset_version_updates_required.py b/tools/python/find_optimizer_opset_version_updates_required.py index b46f7e4a54d9c..3c7d94b8ba038 100644 --- a/tools/python/find_optimizer_opset_version_updates_required.py +++ b/tools/python/find_optimizer_opset_version_updates_required.py @@ -7,7 +7,6 @@ import logging import os import re -import typing logging.basicConfig(format="[%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger() @@ -30,7 +29,7 @@ def parse_args(): return args -def get_call_args_from_file(filename: str, function_or_declaration: str) -> typing.List[str]: +def get_call_args_from_file(filename: str, function_or_declaration: str) -> list[str]: """ Search a file for all function calls or declarations that match the provided name. Requires both the opening '(' and closing ')' to be on the same line. @@ -63,7 +62,7 @@ def get_call_args_from_file(filename: str, function_or_declaration: str) -> typi return results -def get_multiline_call_args_from_file(filename: str, function_or_declaration: str) -> typing.List[str]: +def get_multiline_call_args_from_file(filename: str, function_or_declaration: str) -> list[str]: """ Search a file for all function calls or declarations that match the provided name. Allows the opening '(' and closing ')' to be split across multiple lines. @@ -96,7 +95,7 @@ def get_multiline_call_args_from_file(filename: str, function_or_declaration: st return results -def _add_if_newer(domain: str, op: str, opset: int, op_to_opset: typing.Dict[str, int]): +def _add_if_newer(domain: str, op: str, opset: int, op_to_opset: dict[str, int]): key = domain + "." 
+ op if key not in op_to_opset or op_to_opset[key] < opset: op_to_opset[key] = opset diff --git a/tools/python/gen_contrib_doc.py b/tools/python/gen_contrib_doc.py index ce6f0a1205fdc..c190ef3b0ba7d 100644 --- a/tools/python/gen_contrib_doc.py +++ b/tools/python/gen_contrib_doc.py @@ -8,10 +8,8 @@ import pathlib import sys from collections import defaultdict -from typing import Any, Dict, List, Sequence, Set, Text, Tuple # noqa: F401 -import numpy as np # type: ignore -from onnx import AttributeProto, FunctionProto # noqa: F401 +import numpy as np import onnxruntime.capi.onnxruntime_pybind11_state as rtpy from onnxruntime.capi.onnxruntime_pybind11_state import schemadef # noqa: F401 @@ -305,11 +303,6 @@ def support_level_str(level): # type: (OpSchema.SupportType) -> Text return "experimental " if level == OpSchema.SupportType.EXPERIMENTAL else "" -# def function_status_str(status=OperatorStatus.Value("EXPERIMENTAL")): # type: ignore -# return \ -# "experimental " if status == OperatorStatus.Value('EXPERIMENTAL') else "" # type: ignore - - def main(output_path: str, domain_filter: [str]): with open(output_path, "w", newline="", encoding="utf-8") as fout: fout.write("## Contrib Operator Schemas\n") diff --git a/tools/python/onnx2tfevents.py b/tools/python/onnx2tfevents.py index 9dfde13090b07..909bc04817ff1 100644 --- a/tools/python/onnx2tfevents.py +++ b/tools/python/onnx2tfevents.py @@ -13,7 +13,7 @@ import inspect import itertools from abc import ABC, abstractmethod -from typing import Callable, List +from collections.abc import Callable import numpy as np import onnx @@ -203,7 +203,7 @@ def _add_sections(self, name: str) -> None: if len(sec) > 0: self.sections.add(sec) - def _get_sections(self, curr_name: str, sections: List[str]) -> None: + def _get_sections(self, curr_name: str, sections: list[str]) -> None: for section in self.sections: if curr_name.startswith(section) and (len(curr_name) == len(section) or curr_name[len(section)] == "."): sections.append(section) @@ -217,8 +217,7 @@ def _transform_name(self, name: str) -> str: if "/" in name: if name.startswith(f"/{self.original_module_name}/"): name = name[len(self.original_module_name) + 2 :] - if name.startswith("/"): - name = name[1:] + name = name.removeprefix("/") return name sections = [] diff --git a/tools/python/ort_test_dir_utils.py b/tools/python/ort_test_dir_utils.py index 3af407b2aeee6..59bb6670c8794 100644 --- a/tools/python/ort_test_dir_utils.py +++ b/tools/python/ort_test_dir_utils.py @@ -159,7 +159,7 @@ def save_data(prefix, name_data_map, model_info): sess = ort.InferenceSession(test_model_filename, so) outputs = sess.run(output_names, name_input_map) name_output_map = {} - for name, data in zip(output_names, outputs): + for name, data in zip(output_names, outputs, strict=False): name_output_map[name] = data save_data("output", name_output_map, model_outputs) diff --git a/tools/python/run_CIs_for_branch.py b/tools/python/run_CIs_for_branch.py index 975ea2b988d75..b8d9b9d9d5f72 100644 --- a/tools/python/run_CIs_for_branch.py +++ b/tools/python/run_CIs_for_branch.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+from __future__ import annotations import argparse import json import os import subprocess import sys -import typing from run_CIs_for_external_pr import get_pipeline_names from util.platform_helpers import is_windows @@ -78,7 +78,7 @@ def _parse_args(): return args -def _run_az_pipelines_command(command: typing.List[str]): +def _run_az_pipelines_command(command: list[str]): try: az = "az.cmd" if is_windows() else "az" az_output = subprocess.run([az, "pipelines", *command], capture_output=True, text=True, check=True) diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index 228c8016170d9..cee32073fa473 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import json import os import subprocess import sys -import typing def get_pipeline_names(): @@ -72,7 +72,7 @@ def _parse_args(): return args -def run_gh_pr_command(command: typing.List[str], check: bool = True): +def run_gh_pr_command(command: list[str], check: bool = True): try: return subprocess.run(["gh", "pr", *command], capture_output=True, text=True, check=check) except subprocess.CalledProcessError as cpe: diff --git a/tools/python/run_adb.py b/tools/python/run_adb.py index 7506a8699df05..aefdb2344d050 100755 --- a/tools/python/run_adb.py +++ b/tools/python/run_adb.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import logging import os import sys -import typing from util import run from util.android import get_sdk_tool_paths -def run_adb(android_sdk_root: str, args: typing.List[str]): +def run_adb(android_sdk_root: str, args: list[str]): sdk_tool_paths = get_sdk_tool_paths(android_sdk_root) run(sdk_tool_paths.adb, *args) diff --git a/tools/python/run_android_emulator.py b/tools/python/run_android_emulator.py index 2826921726556..6d7c29fc58296 100755 --- a/tools/python/run_android_emulator.py +++ b/tools/python/run_android_emulator.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import contextlib diff --git a/tools/python/sparsify_initializers.py b/tools/python/sparsify_initializers.py index 2c80b07cd0a12..14f2e0b62c069 100644 --- a/tools/python/sparsify_initializers.py +++ b/tools/python/sparsify_initializers.py @@ -5,15 +5,15 @@ # This script opens an existing model in onnx format and attempts to # move initializers from model.graph.initializer field to model.graph.sparse_initializer field # and convert them into ONNX COO flat index format. +from __future__ import annotations import argparse import logging import sys -from typing import List, Tuple # noqa: F401 import numpy as np import onnx -from onnx import ModelProto, SparseTensorProto, TensorProto, numpy_helper # noqa: F401 +from onnx import ModelProto, TensorProto, numpy_helper logger = logging.getLogger(__name__) diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py index 24004d6be761d..8f3ed97cae53f 100644 --- a/tools/python/util/android/android.py +++ b/tools/python/util/android/android.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+from __future__ import annotations import collections import contextlib @@ -108,7 +109,7 @@ def _stop_process_with_pid(pid: int): def start_emulator( sdk_tool_paths: SdkToolPaths, avd_name: str, - extra_args: typing.Optional[typing.Sequence[str]] = None, + extra_args: typing.Sequence[str] | None = None, timeout_minutes: int = 20, ) -> subprocess.Popen: if check_emulator_running_using_avd_name(avd_name=avd_name): @@ -326,7 +327,7 @@ def stop_emulator_by_pid(emulator_pid: int, timeout_seconds: int = 120): _log.info("Emulator stopped successfully.") -def stop_emulator(emulator_proc_or_pid: typing.Union[subprocess.Popen, int], timeout_seconds: int = 120): +def stop_emulator(emulator_proc_or_pid: subprocess.Popen | int, timeout_seconds: int = 120): """ Stops the emulator process, checking its running status before and after stopping. :param emulator_proc_or_pid: The emulator process (subprocess.Popen) or PID (int). diff --git a/tools/python/util/file_utils.py b/tools/python/util/file_utils.py index 0373ac171144f..4036841cbfd34 100644 --- a/tools/python/util/file_utils.py +++ b/tools/python/util/file_utils.py @@ -1,12 +1,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import os import pathlib import typing -def path_match_suffix_ignore_case(path: typing.Union[pathlib.Path, str], suffix: str) -> bool: +def path_match_suffix_ignore_case(path: pathlib.Path | str, suffix: str) -> bool: """ Returns whether `path` ends in `suffix`, ignoring case. """ @@ -16,8 +17,8 @@ def path_match_suffix_ignore_case(path: typing.Union[pathlib.Path, str], suffix: def files_from_file_or_dir( - file_or_dir_path: typing.Union[pathlib.Path, str], predicate: typing.Callable[[pathlib.Path], bool] = lambda _: True -) -> typing.List[pathlib.Path]: + file_or_dir_path: pathlib.Path | str, predicate: typing.Callable[[pathlib.Path], bool] = lambda _: True +) -> list[pathlib.Path]: """ Gets the files in `file_or_dir_path` satisfying `predicate`. If `file_or_dir_path` is a file, the single file is considered. Otherwise, all files in the directory are diff --git a/tools/python/util/get_azcopy.py b/tools/python/util/get_azcopy.py index bfcf228a956eb..32ad367b2a010 100644 --- a/tools/python/util/get_azcopy.py +++ b/tools/python/util/get_azcopy.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import contextlib import logging diff --git a/tools/python/util/make_dynamic_shape_fixed.py b/tools/python/util/make_dynamic_shape_fixed.py index f4e09a8cc04a3..2dc89399a604c 100644 --- a/tools/python/util/make_dynamic_shape_fixed.py +++ b/tools/python/util/make_dynamic_shape_fixed.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import os diff --git a/tools/python/util/mobile_helpers/test/test_usability_checker.py b/tools/python/util/mobile_helpers/test/test_usability_checker.py index 2deacfc91dd1c..7fde729aa0053 100644 --- a/tools/python/util/mobile_helpers/test/test_usability_checker.py +++ b/tools/python/util/mobile_helpers/test/test_usability_checker.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
+from __future__ import annotations import logging import pathlib diff --git a/tools/python/util/onnx_model_utils.py b/tools/python/util/onnx_model_utils.py index 1938a2411e11d..12fff27031e93 100644 --- a/tools/python/util/onnx_model_utils.py +++ b/tools/python/util/onnx_model_utils.py @@ -1,9 +1,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import logging import pathlib -from typing import Optional import onnx from onnx import version_converter @@ -62,8 +62,8 @@ def get_opsets_imported(model: onnx.ModelProto): def update_onnx_opset( model_path: pathlib.Path, opset: int, - out_path: Optional[pathlib.Path] = None, - logger: Optional[logging.Logger] = None, + out_path: pathlib.Path | None = None, + logger: logging.Logger | None = None, ): """ Helper to update the opset of a model using onnx version_converter. Target opset must be greater than current opset. diff --git a/tools/python/util/optimize_onnx_model.py b/tools/python/util/optimize_onnx_model.py index b7ebb54b9c8fa..c5459b2d9ff9a 100644 --- a/tools/python/util/optimize_onnx_model.py +++ b/tools/python/util/optimize_onnx_model.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import argparse import os diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py index 598549c42b60a..53f7a34015060 100644 --- a/tools/python/util/ort_format_model/operator_type_usage_processors.py +++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py @@ -1,8 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import json -import typing from abc import ABC, abstractmethod import ort_flatbuffers_py.fbs as fbs @@ -65,9 +65,7 @@ def __init__(self, domain: str, optype: str): def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): pass - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None): """ Given the string from a kernel registration, determine if the registration is required or not. :param type_in_registration: Type string from kernel registration @@ -113,8 +111,8 @@ def __init__( optype: str, inputs: [int] = [0], # noqa: B006 outputs: [int] = [], # noqa: B006 - required_input_types: typing.Dict[int, typing.Set[str]] = {}, # noqa: B006 - required_output_types: typing.Dict[int, typing.Set[str]] = {}, # noqa: B006 + required_input_types: dict[int, set[str]] = {}, # noqa: B006 + required_output_types: dict[int, set[str]] = {}, # noqa: B006 ): """ Create DefaultTypeUsageProcessor. Types for one or more inputs and/or outputs can be tracked by the processor. 
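The hunk above only modernizes the annotations to dict[int, set[str]] and keeps the existing noqa: B006 markers on the mutable default arguments. For context, a small sketch of the hazard that suppressed warning refers to; the function name and data are made up, and Python 3.9+ is assumed so the builtin generic is subscriptable at runtime:

def track_types(types: dict[int, set[str]] = {}):  # noqa: B006 - default is shared
    # The default dict is created once at definition time and reused by every call.
    types.setdefault(0, set()).add("tensor(float)")
    return types


a = track_types()
b = track_types()
print(a is b)  # True: both calls mutated the same dict object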
@@ -186,9 +184,7 @@ def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): type_str = value_name_to_typestr(node.Outputs(o), value_name_to_typeinfo) self._output_types[o].add(type_str) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None): if 0 not in self._input_types: # currently all standard typed registrations are for input 0. # custom registrations can be handled by operator specific processors (e.g. OneHotProcessor below). @@ -262,9 +258,7 @@ def __init__(self, domain: str, optype: str): # init with tracking of input 1 only. super().__init__(domain, optype, inputs=[1], outputs=[]) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None): return self.is_input_type_enabled(type_in_registration, 1, globally_allowed_types) @@ -277,9 +271,7 @@ def __init__(self, domain: str, optype: str): # init with tracking of output 0 only. super().__init__(domain, optype, inputs=[], outputs=[0]) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None): return self.is_output_type_enabled(type_in_registration, 0, globally_allowed_types) @@ -301,9 +293,7 @@ def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): key = (type0, type2, type1) self._triples.add(key) - def is_typed_registration_needed( - self, type_in_registration: str, globally_allowed_types: typing.Optional[typing.Set[str]] - ): + def is_typed_registration_needed(self, type_in_registration: str, globally_allowed_types: set[str] | None): # the OneHot registration involves a concatenation of the 3 types involved reg_types = tuple([_reg_type_to_cpp_type(reg_type) for reg_type in _split_reg_types(type_in_registration)]) if globally_allowed_types is not None: @@ -633,7 +623,7 @@ class GloballyAllowedTypesOpTypeImplFilter(OpTypeImplFilterInterface): _valid_allowed_types = set(FbsTypeInfo.tensordatatype_to_string.values()) # noqa: RUF012 - def __init__(self, globally_allowed_types: typing.Set[str]): + def __init__(self, globally_allowed_types: set[str]): self._operator_processors = _create_operator_type_usage_processors() if not globally_allowed_types.issubset(self._valid_allowed_types): diff --git a/tools/python/util/reduced_build_config_parser.py b/tools/python/util/reduced_build_config_parser.py index be39562e2d60d..0afcca2388f10 100644 --- a/tools/python/util/reduced_build_config_parser.py +++ b/tools/python/util/reduced_build_config_parser.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import os diff --git a/tools/python/util/run.py b/tools/python/util/run.py index 838db8f789eac..b1ebd044f3420 100644 --- a/tools/python/util/run.py +++ b/tools/python/util/run.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +from __future__ import annotations import logging import os
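The patch also adds an explicit strict=False to zip() calls (for example in ort_test_dir_utils.py above), which spells out the long-standing truncating behavior; the keyword itself is new in Python 3.10. A brief sketch of the difference, with made-up data:

names = ["out_0", "out_1"]
values = [1.0]

print(dict(zip(names, values, strict=False)))  # {'out_0': 1.0} - silently stops at the shorter input
# dict(zip(names, values, strict=True))        # would raise ValueError because the lengths differ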