I frequently deal with fairly large datasets (some high-dimensional loops and a buffered get-parameter). Loading these datasets from the database into a usable format (i.e., `xarray`) takes an extremely long time.
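For reference, the HDF5 I/O performance for this dataset: loading the same data through xarray's netCDF/HDF5 backend takes about half a second (hence the factor-of-120 figure below). A minimal sketch of how that comparison can be reproduced, assuming the dataset has already been loaded into `xds` once (the file name is hypothetical):

```python
import time

import xarray as xr

# One-time export of the already-loaded dataset to HDF5/netCDF4
# (file name is hypothetical).
xds.to_netcdf("dataset.nc", engine="netcdf4")

# Time a reload from disk; .load() forces the data to actually be
# read instead of opened lazily.
t0 = time.perf_counter()
with xr.open_dataset("dataset.nc", engine="netcdf4") as ds:
    ds.load()
print(f"HDF5 load: {time.perf_counter() - t0:.2f} s")
```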
By contrast, loading this dataset from the database (~4 GB on disk) takes 60 seconds! Over the course of this minute, memory usage temporarily spikes by 8 GB (Windows 10). The runtime breaks down as follows (some functions cherry-picked for detail, `%lprun -u 1e-3 -f ... xds = qcds.to_xarray_dataset()`, timings in ms):
to_xarray_dataset()

```
Total time: 57.4706 s
File: ...\qcodes\dataset\data_set.py
Function: to_xarray_dataset at line 986

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
  1042         2      28081.6  14040.8     48.9          data = self.get_parameter_data(*params,
  1043         1          0.0      0.0      0.0                                         start=start,
  1044         1          0.0      0.0      0.0                                         end=end)
  1045
  1046         1      29389.0  29389.0     51.1          return load_to_xarray_dataset(self, data, use_multi_index=use_multi_index)
```
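So the minute splits roughly evenly between the SQLite read (`get_parameter_data`) and the pandas/xarray conversion (`load_to_xarray_dataset`). The two halves can also be timed separately when reproducing this; a rough sketch, assuming the run is loaded via `load_by_run_spec` (the run id is hypothetical, and note `to_xarray_dataset` re-reads the data internally):

```python
import time

from qcodes.dataset import load_by_run_spec

qcds = load_by_run_spec(captured_run_id=123)  # hypothetical run id

t0 = time.perf_counter()
data = qcds.get_parameter_data()   # SQLite read + numpy expansion only
t1 = time.perf_counter()
xds = qcds.to_xarray_dataset()     # reads again, then converts to xarray
t2 = time.perf_counter()

print(f"get_parameter_data(): {t1 - t0:.1f} s")
print(f"to_xarray_dataset():  {t2 - t1:.1f} s")
```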
export_to_xarray()

```
Total time: 29.3664 s
File: ...\qcodes\dataset\exporters\export_to_xarray.py
Function: _load_to_xarray_dataarray_dict_no_metadata at line 62

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    68         1          0.0      0.0      0.0      import pandas as pd
    69         1          0.0      0.0      0.0      import xarray as xr
    70
    71         1          0.0      0.0      0.0      if use_multi_index not in ("auto", "always", "never"):
    72                                                   raise ValueError(
    73                                                       f"Invalid value for use_multi_index. Expected one of 'auto', 'always', 'never' but got {use_multi_index}"
    74                                                   )
    75
    76         1          0.0      0.0      0.0      data_xrdarray_dict: dict[str, xr.DataArray] = {}
    77
    78         6          0.0      0.0      0.0      for name, subdict in datadict.items():
    79         5       8157.3   1631.5     27.8          index = _generate_pandas_index(subdict)
    80
    81         5          0.0      0.0      0.0          if index is None:
    82                                                       xrdarray: xr.DataArray = (
    83                                                           _data_to_dataframe(subdict, index=index)
    84                                                           .to_xarray()
    85                                                           .get(name, xr.DataArray())
    86                                                       )
    87                                                       data_xrdarray_dict[name] = xrdarray
    88                                                   else:
    89         5      15452.9   3090.6     52.6              index_unique = len(index.unique()) == len(index)
    90
    91         5        248.4     49.7      0.8              df = _data_to_dataframe(subdict, index)
    92
    93         5          0.0      0.0      0.0              if not index_unique:
    94                                                           # index is not unique so we fallback to using a counter as index
    95                                                           # and store the index as a variable
    96                                                           xrdata_temp = df.reset_index().to_xarray()
    97                                                           for _name in subdict:
    98                                                               data_xrdarray_dict[_name] = xrdata_temp[_name]
    99                                                       else:
   100         5       1982.9    396.6      6.8                  calc_index = _calculate_index_shape(index)
   101         5          0.0      0.0      0.0                  index_prod = prod(calc_index.values())
   102                                                           # if the product of the len of individual index dims == len(total_index)
   103                                                           # we are on a grid
   104
   105         5          0.0      0.0      0.0                  on_grid = index_prod == len(index)
   106
   107        10          0.0      0.0      0.0                  export_with_multi_index = (
   108        10          0.0      0.0      0.0                      not on_grid
   109         5          0.0      0.0      0.0                      and dataset.description.shapes is None
   110                                                               and use_multi_index == "auto"
   111         5          0.0      0.0      0.0                  ) or use_multi_index == "always"
   112
   113         5          0.0      0.0      0.0                  if export_with_multi_index:
   114                                                               assert isinstance(df.index, pd.MultiIndex)
   115
   116                                                               if hasattr(xr, "Coordinates"):
   117                                                                   coords = xr.Coordinates.from_pandas_multiindex(
   118                                                                       df.index, "multi_index"
   119                                                                   )
   120                                                                   xrdarray = xr.DataArray(df[name], coords=coords)
   121                                                               else:
   122                                                                   # support xarray < 2023.8.0, can be removed when we drop support for that
   123                                                                   xrdarray = xr.DataArray(df[name], [("multi_index", df.index)])
   124                                                           else:
   125         5       3524.9    705.0     12.0                      xrdarray = df.to_xarray().get(name, xr.DataArray())
   126
   127         5          0.0      0.0      0.0                  data_xrdarray_dict[name] = xrdarray
   128
   129         1          0.0      0.0      0.0      return data_xrdarray_dict
```
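More than half of the conversion time here is the uniqueness check on line 89, which materializes a fully deduplicated index just to compare lengths. pandas exposes the same boolean as `Index.is_unique`, which avoids building a new index and may be much cheaper on a large (Multi)Index; a small sketch of the equivalence (sizes illustrative):

```python
import numpy as np
import pandas as pd

# A grid-like MultiIndex mimicking the setpoints of a large sweep
# (sizes illustrative).
idx = pd.MultiIndex.from_product(
    [np.arange(2000), np.arange(2000)], names=["x", "y"]
)

# What line 89 does today: materialize the deduplicated index,
# then compare lengths.
unique_slow = len(idx.unique()) == len(idx)

# Equivalent boolean without constructing a new index.
unique_fast = idx.is_unique

assert unique_slow == unique_fast
```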
_expand_data_to_arrays()

```
Total time: 4.06941 s
File: ...\qcodes\dataset\sqlite\queries.py
Function: _expand_data_to_arrays at line 266

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   266                                           def _expand_data_to_arrays(
   267                                               data: list[tuple[Any, ...]], paramspecs: Sequence[ParamSpecBase]
   268                                           ) -> None:
   269        26          0.0      0.0      0.0      types = [param.type for param in paramspecs]
   270                                               # if we have array type parameters expand all other parameters
   271                                               # to arrays
   272         5          0.0      0.0      0.0      if 'array' in types:
   273
   274         1          0.0      0.0      0.0          if ('numeric' in types or 'text' in types
   275                                                           or 'complex' in types):
   276         1          0.0      0.0      0.0              first_array_element = types.index('array')
   277         1          0.0      0.0      0.0              types_mapping: dict[int, Callable[[str], np.dtype[Any]]] = {}
   278         6          0.0      0.0      0.0              for i, x in enumerate(types):
   279         5          0.0      0.0      0.0                  if x == "numeric":
   280         3          0.0      0.0      0.0                      types_mapping[i] = lambda _: np.dtype(np.float64)
   281         2          0.0      0.0      0.0                  elif x == "complex":
   282                                                               types_mapping[i] = lambda _: np.dtype(np.complex128)
   283         2          0.0      0.0      0.0                  elif x == "text":
   284                                                               types_mapping[i] = lambda array: np.dtype(f"U{len(array)}")
   285
   286     46253         30.0      0.0      0.7              for i_row, row in enumerate(data):
   287                                                           # todo should we handle int/float types here
   288                                                           # we would in practice have to perform another
   289                                                           # loop to check that all elements of a given can be cast to
   290                                                           # int without losing precision before choosing an integer
   291                                                           # representation of the array
   292     92504       3070.6      0.0     75.5                  data[i_row] = tuple(
   293                                                               np.full_like(
   294                                                                   row[first_array_element], array, dtype=types_mapping[i](array)
   295                                                               )
   296                                                               if i in types_mapping
   297                                                               else array
   298     46252         25.8      0.0      0.6                      for i, array in enumerate(row)
   299                                                           )
   300
   301         1          0.0      0.0      0.0          row_shape = None
   302     46253         25.8      0.0      0.6          for i_row, row in enumerate(data):
   303                                                       # now expand all one element arrays to match the expected size
   304                                                       # one element arrays are introduced if scalar values are stored
   305                                                       # with an explicit array storage type
   306     46252         16.1      0.0      0.4              max_size = 0
   307    277512        146.8      0.0      3.6              for i, array in enumerate(row):
   308    231260        110.2      0.0      2.7                  if array.size > max_size:
   309     46252         18.8      0.0      0.5                      if max_size > 1:
   310                                                                   log.warning(
   311                                                                       f"Cannot expand array of size {max_size} "
   312                                                                       f"to size {array.size}"
   313                                                                   )
   314     46252         23.4      0.0      0.6                      max_size, row_shape = array.size, array.shape
   315
   316     46252         19.4      0.0      0.5              if max_size > 1:
   317     46252         18.2      0.0      0.4                  assert row_shape is not None
   318     92504        549.2      0.0     13.5                  data[i_row] = tuple(
   319                                                               np.full(row_shape, array, dtype=array.dtype)
   320                                                               if array.size == 1
   321                                                               else array
   322     46252         15.1      0.0      0.4                      for array in row
   323                                                           )
```
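Line 292 dominates this function (~75%): every one of the ~46k rows is rebuilt with `np.full_like`, allocating a full copy of each scalar-typed column per row. Where the expanded column is only read afterwards, `np.broadcast_to` yields a zero-copy read-only view instead; a minimal sketch of the idea, not the library's implementation:

```python
import numpy as np

row_shape = (1, 201)       # shape of the array-typed column in one row
scalar = np.float64(0.5)   # a 'numeric' column value stored alongside it

# Current approach (per row): allocate and fill a new array.
expanded_copy = np.full(row_shape, scalar, dtype=np.float64)

# Zero-copy alternative: a broadcast view of the scalar. Valid only
# as long as the result is read, never written to.
expanded_view = np.broadcast_to(np.asarray(scalar), row_shape)

assert expanded_view.shape == expanded_copy.shape
assert not expanded_view.flags.writeable  # broadcast views are read-only
```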
many_many()

```
Total time: 21.7225 s
File: ...\qcodes\dataset\sqlite\query_helpers.py
Function: many_many at line 117

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   117                                           def many_many(curr: sqlite3.Cursor, *columns: str) -> list[tuple[Any, ...]]:
   118                                               """Get all values of many columns
   119                                               Args:
   120                                                   curr: cursor to operate on
   121                                                   columns: names of the columns
   122
   123                                               Returns:
   124                                                   list of lists of values
   125                                               """
   126         5      21722.4   4344.5    100.0      res = curr.fetchall()
   127
   128         5          0.1      0.0      0.0      if _need_to_select(curr, *columns):
   129                                                   raise RuntimeError(
   130                                                       "Expected consistent selection: cursor has columns "
   131                                                       f"{tuple(c[0] for c in curr.description)} but expected {columns}"
   132                                                   )
   133
   134         5          0.0      0.0      0.0      return res
```
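Essentially all of `many_many` is a single `cursor.fetchall()`, which materializes one Python tuple per row in one go and plausibly accounts for a good part of the transient 8 GB spike. One direction would be draining the cursor in chunks so each chunk can be converted to numpy and released before the next one is fetched; a hypothetical sketch, not the library's API:

```python
import sqlite3
from typing import Any


def many_many_chunked(
    curr: sqlite3.Cursor, chunk_size: int = 100_000
) -> list[tuple[Any, ...]]:
    """Drain a cursor in fixed-size chunks instead of one fetchall()."""
    rows: list[tuple[Any, ...]] = []
    while True:
        chunk = curr.fetchmany(chunk_size)
        if not chunk:
            break
        # A real implementation could convert each chunk to numpy here
        # and drop the Python tuples before fetching the next chunk,
        # bounding peak memory.
        rows.extend(chunk)
    return rows
```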
Unfortunately, I am not familiar with SQLite, so I cannot say how much there is to be gained in the actual database I/O, but the discrepancy with HDF5 makes me think that a lot could be gained at least in the pure-Python parts. Loading a factor of ~120 slower than xarray's HDF5 path seems absurd.