Use threadpoolctl

qucontrol · Mar 24, 2020 · f44b4c4 · f44b4c4
1 parent e6c8f8b
commit f44b4c4
Show file tree

Hide file tree

Showing 14 changed files with 333 additions and 150 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -9,7 +9,9 @@ History
 * Added: Support for Python 3.8
 * Added: Support for QuTiP 4.5.0
 * Added: Support for parallelization with loky_ (`#72`_)
-* Added: ``krotov.parallelization.set_parallelization`` function.
+* Added: ``krotov.parallelization.set_parallelization`` function
+* Added: `limit_thread_pool` option to ``krotov.optimize_pulses``
+* Changed: ``krotov.propagators.expm`` now guarantees single-threaded execution
 
 
 1.0.0 (2019-12-16)

diff --git a/docs/10_howto.rst b/docs/10_howto.rst
@@ -535,6 +535,61 @@ Krotov's method is inherently parallel across different objectives. See
 :mod:`krotov.parallelization`, and the
 :ref:`/notebooks/05_example_transmon_xgate.ipynb` for an example.
 
+It is exceedingly important to ensure that you do not use any accidental nested
+parallelization. The :mod:`numpy` library is often eager to run in a
+multi-threaded mode that does not combine well with the process-based
+parallelization in :mod:`krotov.parallelization`. See
+:ref:`HowtoLimitThreadpool`.
+
+
+.. _HowtoLimitThreadpool:
+
+How to avoid over-subscribing the CPU when using parallelization
+----------------------------------------------------------------
+
+A common caveat of parallelization is that the number of numerically intensive
+threads or processes should not be larger than the number of CPUs on the
+machine. "Oversubscribing" the CPUs can make a parallelized program run slower
+by orders of magnitude compared to a serial program!
+
+One consequence of this realization is that *nested parallelization* must be
+tightly controlled: If your program uses process-based parallelization (and
+assuming each process can tax a CPU core at 100%), then you must prevent
+multiple threads within each process. Depending on how they were compiled, some
+of Python's low-level numerical libraries (:mod:`numpy` in particular) are
+eager to run in a multi-threaded mode, and it can be surprisingly difficult to
+convince them not to do this. In general, you can
+`set environment variables to force low-level numerical code into single-threaded mode`_:
+
+.. code-block:: shell
+
+    export MKL_NUM_THREADS=1
+    export NUMEXPR_NUM_THREADS=1
+    export OMP_NUM_THREADS=1
+
+It may be a good idea to set these variables in your ``.bashrc`` (or the
+equivalent for whatever shell you are using), and only change their values when
+you specifically want to enable multi-threaded execution. You can sometimes set
+these variables inside a Python script or notebook, but you must do so before
+importing :mod:`numpy`.
+
+The threadpoolctl_ Python package is another way of eliminating
+unexpected multi-threading. The functions in :mod:`krotov.parallelization` use
+this package internally to suppress low-level threads. For example, when using
+:func:`krotov.parallelization.parallel_map`, you can expect the execution to
+be limited to the given `num_cpus`. Also, :func:`.optimize_pulses` by
+default limits multi-threading, cf. the `limit_thread_pool` argument. Lastly,
+:func:`krotov.propagators.expm` ensures that the matrix exponentiation is
+calculated in a single thread.
+
+Always monitor your processes in a tool like htop_ to watch out for unexpected
+CPU usage.
+
+.. _set environment variables to force low-level numerical code into single-threaded mode: https://stackoverflow.com/questions/30791550/limit-number-of-threads-in-numpy/31622299#31622299
+.. _threadpoolctl: https://github.com/joblib/threadpoolctl
+.. _htop: https://hisham.hm/htop/
+
+
 .. _HowtoStoreResult:
 
 How to prevent losing an optimization result

diff --git a/docs/nbval_sanitize.cfg b/docs/nbval_sanitize.cfg
@@ -37,3 +37,8 @@ replace: \n
 # strip secs from info_hook table in 02_example_lambda_system_rwa_complex_pulse.ipynb
 regex: (?<=[\s\d]{5}[\s\d.e+-]{9}([\s\d.e+-]{12}){5}([\s\d.e+-]{11}){1}(        n/a){2})[\s\d]{6}\n
 replace: \n
+
+[regex10]
+# strip secs from info_hook table in 08_example_ensemble.ipynb
+regex: (?<=[\s\d]{5}[\s\d.e+-]{9}([\s\d.e+-]{11}){4}[\s\d.e+-]{12}[\s\d.e+-]{10}([n/a\s\d.e+-]{11}){2})[\s\d]{6}\n
+replace: \n
diff --git a/docs/notebooks/01_example_simple_state_to_state.ipynb b/docs/notebooks/01_example_simple_state_to_state.ipynb
diff --git a/docs/notebooks/02_example_lambda_system_rwa_complex_pulse.ipynb b/docs/notebooks/02_example_lambda_system_rwa_complex_pulse.ipynb
diff --git a/docs/notebooks/03_example_lambda_system_rwa_non_hermitian.ipynb b/docs/notebooks/03_example_lambda_system_rwa_non_hermitian.ipynb
diff --git a/docs/notebooks/04_example_dissipative_qubit_reset.ipynb b/docs/notebooks/04_example_dissipative_qubit_reset.ipynb
diff --git a/docs/notebooks/05_example_transmon_xgate.ipynb b/docs/notebooks/05_example_transmon_xgate.ipynb
@@ -21,11 +21,11 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "qutip            4.4.1\n",
       "matplotlib.pylab 1.17.2\n",
+      "scipy            1.3.1\n",
       "numpy            1.17.2\n",
       "krotov           1.0.0+dev\n",
-      "qutip            4.4.1\n",
-      "scipy            1.3.1\n",
       "matplotlib       3.2.1\n",
       "CPython 3.7.6\n",
       "IPython 7.13.0\n"
@@ -548,11 +548,11 @@
      "text": [
       "iter.      J_T    g_a_int          J  Delta J_T    Delta J  secs\n",
       "0     1.00e+00   0.00e+00   1.00e+00        n/a        n/a     4\n",
-      "1     2.80e-01   3.41e-01   6.22e-01  -7.20e-01  -3.78e-01    11\n",
-      "2     2.12e-01   3.06e-02   2.43e-01  -6.81e-02  -3.75e-02    16\n",
-      "3     1.35e-01   3.28e-02   1.68e-01  -7.72e-02  -4.44e-02    11\n",
-      "4     9.79e-02   1.56e-02   1.13e-01  -3.71e-02  -2.15e-02    12\n",
-      "5     7.13e-02   1.11e-02   8.25e-02  -2.65e-02  -1.54e-02    14\n"
+      "1     2.80e-01   3.41e-01   6.22e-01  -7.20e-01  -3.78e-01    14\n",
+      "2     2.12e-01   3.06e-02   2.43e-01  -6.81e-02  -3.75e-02    14\n",
+      "3     1.35e-01   3.28e-02   1.68e-01  -7.72e-02  -4.44e-02    15\n",
+      "4     9.79e-02   1.56e-02   1.13e-01  -3.71e-02  -2.15e-02    16\n",
+      "5     7.13e-02   1.11e-02   8.25e-02  -2.65e-02  -1.54e-02    18\n"
      ]
     }
    ],

diff --git a/docs/notebooks/07_example_PE.ipynb b/docs/notebooks/07_example_PE.ipynb
diff --git a/docs/notebooks/09_example_numpy.ipynb b/docs/notebooks/09_example_numpy.ipynb
diff --git a/setup.py b/setup.py
@@ -25,7 +25,14 @@ def get_version(filename):
     history = ''
 
 # requirements for use
-requirements = ['glom', 'numpy', 'scipy', 'qutip>=4.3.1', 'uniseg']
+requirements = [
+    'glom',
+    'numpy',
+    'scipy',
+    'qutip>=4.3.1',
+    'threadpoolctl',
+    'uniseg',
+]
 if sys.platform != 'linux':
     requirements.append('loky')
 

diff --git a/src/krotov/optimize.py b/src/krotov/optimize.py
@@ -5,6 +5,7 @@
 from functools import partial
 
 import numpy as np
+import threadpoolctl
 from qutip import Qobj
 from qutip.parallel import serial_map
 
@@ -19,6 +20,7 @@
 )
 from .info_hooks import chain
 from .mu import derivative_wrt_pulse
+from .parallelization import USE_THREADPOOL_LIMITS
 from .propagators import Propagator, expm
 from .result import Result
 from .second_order import _overlap
@@ -48,7 +50,8 @@ def optimize_pulses(
     continue_from=None,
     skip_initial_forward_propagation=False,
     norm=None,
-    overlap=None
+    overlap=None,
+    limit_thread_pool=None
 ):
     r"""Use Krotov's method to optimize towards the given `objectives`.
 
@@ -204,6 +207,14 @@ def optimize_pulses(
             :meth:`qutip.Qobj.overlap` for Hilbert space states and to the
             Hilbert-Schmidt norm $\tr[\rho_1^\dagger \rho2]$ for density
             matrices or operators.
+        limit_thread_pool (bool or None): If True, try to eliminate
+            multi-threading in low-level numerical routines like :mod:`numpy`,
+            via the use of the ``threadpoolctl`` package. Single-threaded
+            execution is usually faster, but if you know what you are doing and
+            can benchmark multi-threaded execution, you may set this to False
+            to place no restrictions on multi-threading. The default value
+            (None) delegates to
+            :obj:`krotov.parallelization.USE_THREADPOOL_LIMITS`.
 
     Returns:
         Result: The result of the optimization.
@@ -219,6 +230,12 @@ def optimize_pulses(
 
     # Initialization
     logger.info("Initializing optimization with Krotov's method")
+    thread_pool_limiter = None
+    if limit_thread_pool is None:
+        limit_thread_pool = USE_THREADPOOL_LIMITS
+    if limit_thread_pool:
+        logger.debug("Setting threadpoolctrl.threadpool_limits")
+        thread_pool_limiter = threadpoolctl.threadpool_limits(limits=1)
     if mu is None:
         mu = derivative_wrt_pulse
     second_order = sigma is not None
@@ -344,7 +361,7 @@ def optimize_pulses(
             iteration=0,
             info_vals=[],
             shared_data={},
-            **info_hook_static_args
+            **info_hook_static_args,
         )
 
     # Initialize Result object
@@ -506,7 +523,7 @@ def optimize_pulses(
                 info_vals=result.info_vals,
                 shared_data={},
                 iteration=krotov_iteration,
-                **info_hook_static_args
+                **info_hook_static_args,
             )
         # Update optimization `result` with info from finished iteration
         result.iters.append(krotov_iteration)
@@ -565,6 +582,9 @@ def optimize_pulses(
     result.end_local_time = time.localtime()
     for i, pulse in enumerate(optimized_pulses):
         result.optimized_controls[i] = pulse_onto_tlist(pulse)
+    if thread_pool_limiter is not None:
+        logger.debug("Unsetting threadpoolctrl.threadpool_limits")
+        thread_pool_limiter.unregister()
     return result