Use loky for multi-platform multiprocessing #72

Merged
merged 3 commits on Mar 24, 2020

6 changes: 6 additions & 0 deletions HISTORY.rst
@@ -8,6 +8,10 @@ History

* Added: Support for Python 3.8
* Added: Support for QuTiP 4.5.0
* Added: Support for parallelization with loky_ (`#72`_)
* Added: ``krotov.parallelization.set_parallelization`` function
* Added: `limit_thread_pool` option to ``krotov.optimize_pulses``
* Changed: ``krotov.propagators.expm`` now guarantees single-threaded execution


1.0.0 (2019-12-16)
@@ -94,6 +98,7 @@ History
* Support for state-to-state and gate optimization, for both closed and open systems


.. _loky: https://loky.readthedocs.io/
.. _gh-pages: https://qucontrol.github.io/krotov
.. _Doctr: https://drdoctr.github.io
.. _@uiofgh: https://github.com/uiofgh
@@ -123,3 +128,4 @@ History
.. _#66: https://github.com/qucontrol/krotov/issues/66
.. _#67: https://github.com/qucontrol/krotov/issues/67
.. _#68: https://github.com/qucontrol/krotov/issues/68
.. _#72: https://github.com/qucontrol/krotov/issues/72
55 changes: 55 additions & 0 deletions docs/10_howto.rst
@@ -535,6 +535,61 @@ Krotov's method is inherently parallel across different objectives. See
:mod:`krotov.parallelization`, and the
:ref:`/notebooks/05_example_transmon_xgate.ipynb` for an example.

It is critically important to avoid any accidental nested parallelization: the
:mod:`numpy` library is often eager to run in a multi-threaded mode that does
not combine well with the process-based parallelization in
:mod:`krotov.parallelization`. See :ref:`HowtoLimitThreadpool`.


.. _HowtoLimitThreadpool:

How to avoid over-subscribing the CPU when using parallelization
----------------------------------------------------------------

A common caveat of parallelization is that the number of numerically intensive
threads or processes should not be larger than the number of CPUs on the
machine. "Oversubscribing" the CPUs can make a parallelized program run slower
by orders of magnitude compared to a serial program!

One consequence is that *nested parallelization* must be tightly controlled:
if your program uses process-based parallelization (and assuming each process
can tax a CPU core at 100%), then you must prevent each process from spawning
multiple threads. Depending on how they were compiled, some
of Python's low-level numerical libraries (:mod:`numpy` in particular) are
eager to run in a multi-threaded mode, and it can be surprisingly difficult to
convince them not to do this. In general, you can
`set environment variables to force low-level numerical code into single-threaded mode`_:

.. code-block:: shell

    export MKL_NUM_THREADS=1
    export NUMEXPR_NUM_THREADS=1
    export OMP_NUM_THREADS=1

It may be a good idea to set these variables in your ``.bashrc`` (or the
equivalent for whatever shell you are using), and only change their values when
you specifically want to enable multi-threaded execution. You can sometimes set
these variables inside a Python script or notebook, but you must do so before
importing :mod:`numpy`.
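
For example, a script or notebook might start as follows (a minimal sketch;
the only assumption is that nothing has imported :mod:`numpy` yet):

.. code-block:: python

    import os

    # Must be set before numpy (or anything that imports numpy) is loaded;
    # otherwise, the thread pools have already been initialized.
    os.environ['MKL_NUM_THREADS'] = '1'
    os.environ['NUMEXPR_NUM_THREADS'] = '1'
    os.environ['OMP_NUM_THREADS'] = '1'

    import numpy as np  # noqa: E402 -- import only after setting the variables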

The threadpoolctl_ Python package is another way to eliminate unexpected
multi-threading. The functions in :mod:`krotov.parallelization` use this
package internally to suppress low-level threads. For example, when using
:func:`krotov.parallelization.parallel_map`, you can expect the execution to
be limited to the given `num_cpus`. Also, :func:`.optimize_pulses` by default
limits multi-threading, cf. the `limit_thread_pool` argument. Lastly,
:func:`krotov.propagators.expm` ensures that the matrix exponentiation runs in
a single thread.
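
The following is a minimal sketch of what threadpoolctl_ does, independently
of :mod:`krotov` (it assumes a :mod:`numpy` that is linked against a
multi-threaded BLAS such as MKL or OpenBLAS):

.. code-block:: python

    import numpy as np
    import threadpoolctl

    a = np.random.rand(1000, 1000)

    # Limit all recognized thread pools (OpenBLAS, MKL, OpenMP, ...) to a
    # single thread for the duration of the block; the previous limits are
    # restored on exit.
    with threadpoolctl.threadpool_limits(limits=1):
        w = np.linalg.eigvalsh(a @ a.T)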

Always monitor your processes in a tool like htop_ to watch out for unexpected
CPU usage.

.. _set environment variables to force low-level numerical code into single-threaded mode: https://stackoverflow.com/questions/30791550/limit-number-of-threads-in-numpy/31622299#31622299
.. _threadpoolctl: https://github.com/joblib/threadpoolctl
.. _htop: https://hisham.hm/htop/


.. _HowtoStoreResult:

How to prevent losing an optimization result
1 change: 1 addition & 0 deletions docs/conf.py
@@ -101,6 +101,7 @@ def generate_patched_readme(_):
"qutip": ("http://qutip.org/docs/latest/", None),
"glom": ("https://glom.readthedocs.io/en/latest/", None),
"weylchamber": ("https://weylchamber.readthedocs.io/en/latest/", None),
"loky": ("https://loky.readthedocs.io/en/stable/", None),
}

# Add any paths that contain templates here, relative to this directory.
5 changes: 5 additions & 0 deletions docs/nbval_sanitize.cfg
@@ -37,3 +37,8 @@ replace: \n
# strip secs from info_hook table in 02_example_lambda_system_rwa_complex_pulse.ipynb
regex: (?<=[\s\d]{5}[\s\d.e+-]{9}([\s\d.e+-]{12}){5}([\s\d.e+-]{11}){1}( n/a){2})[\s\d]{6}\n
replace: \n

[regex10]
# strip secs from info_hook table in 08_example_ensemble.ipynb
regex: (?<=[\s\d]{5}[\s\d.e+-]{9}([\s\d.e+-]{11}){4}[\s\d.e+-]{12}[\s\d.e+-]{10}([n/a\s\d.e+-]{11}){2})[\s\d]{6}\n
replace: \n
60 changes: 30 additions & 30 deletions docs/notebooks/01_example_simple_state_to_state.ipynb

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions docs/notebooks/02_example_lambda_system_rwa_complex_pulse.ipynb

Large diffs are not rendered by default.

38 changes: 19 additions & 19 deletions docs/notebooks/03_example_lambda_system_rwa_non_hermitian.ipynb

Large diffs are not rendered by default.

38 changes: 19 additions & 19 deletions docs/notebooks/04_example_dissipative_qubit_reset.ipynb

Large diffs are not rendered by default.

246 changes: 158 additions & 88 deletions docs/notebooks/05_example_transmon_xgate.ipynb

Large diffs are not rendered by default.

217 changes: 138 additions & 79 deletions docs/notebooks/06_example_3states.ipynb

Large diffs are not rendered by default.

46 changes: 23 additions & 23 deletions docs/notebooks/07_example_PE.ipynb

Large diffs are not rendered by default.

193 changes: 96 additions & 97 deletions docs/notebooks/08_example_ensemble.ipynb

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions docs/notebooks/09_example_numpy.ipynb

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion setup.py
@@ -25,7 +25,16 @@ def get_version(filename):
history = ''

# requirements for use
requirements = ['glom', 'numpy', 'scipy', 'qutip>=4.3.1', 'uniseg']
requirements = [
    'glom',
    'numpy',
    'scipy',
    'qutip>=4.3.1',
    'threadpoolctl',
    'uniseg',
]
if sys.platform != 'linux':
    requirements.append('loky')

# requirements for development (testing, generating docs)
dev_requirements = [
@@ -40,6 +49,7 @@ def get_version(filename):
    'gitpython',
    'isort',
    'jupyter',
    'loky',
    'matplotlib',
    'nbsphinx',
    'nbval',
22 changes: 15 additions & 7 deletions src/krotov/info_hooks.py
@@ -223,9 +223,13 @@ def print_fidelity(**kwargs):
" ∫gₐ(t)dt: %s\n" % (", ".join(["%.2e" % v for v in g_a_integrals]))
)
    out.write(" λₐ: %s\n" % (", ".join(["%.2e" % λ for λ in lambda_vals])))
    MB_per_timeslot = sum(_qobj_nbytes(state) for state in fw_states_T) / (
        1024 ** 2
    )
    try:
        MB_per_timeslot = sum(_qobj_nbytes(state) for state in fw_states_T) / (
            1024 ** 2
        )
    except AttributeError:
        # e.g. fw_states_T = None (skip_initial_forward_propagation)
        MB_per_timeslot = 0
    out.write(" storage (bw, fw, fw0): ")
    if backward_states is None:
        out.write("None, ")
@@ -266,10 +270,14 @@ def print_fidelity(**kwargs):
                len(forward_states0[0]) * MB_per_timeslot,
            )
        )
    out.write(
        " fw_states_T norm: %s\n"
        % (", ".join(["%f" % state.norm() for state in fw_states_T]))
    )
    try:
        out.write(
            " fw_states_T norm: %s\n"
            % (", ".join(["%f" % state.norm() for state in fw_states_T]))
        )
    except AttributeError:
        # e.g. fw_states_T = None (skip_initial_forward_propagation)
        pass
    if not np.any(tau_vals == None):  # noqa
        out.write(
            " τ: %s\n"
43 changes: 32 additions & 11 deletions src/krotov/objectives.py
@@ -577,20 +577,41 @@ def __str__(self):
    def __repr__(self):
        return "%s[%s]" % (self.__class__.__name__, str(self))

    def __getstate__(self):
        # Return data for the pickle serialization of an objective.
        #
        # This may not preserve time-dependent controls, and is only to enable
        # the serialization of :class:`.Result` objects.
        state = copy.copy(self.__dict__)
        # Remove the unpicklable entries.
        state['H'] = _remove_functions_from_nested_list(state['H'])
        state['c_ops'] = _remove_functions_from_nested_list(state['c_ops'])
        return state

def _Objective_reduce_init(initial_state, H, target, c_ops):
    # args-only version of Objective.__init__, for _Objective_reduce
    return Objective(
        initial_state=initial_state, H=H, target=target, c_ops=c_ops
    )


def _Objective_reduce(obj):
    """Reduce :class:`Objective` for pickling.

    This is a reduction function for customized pickling, see
    :func:`copyreg.pickle`. It is used in :meth:`.Result.dump`.

    In the standard-library-pickle, lambdas are not pickleable, so we replace
    those non-pickleable entries with a placeholder.
    """
    return (
        _Objective_reduce_init,
        (
            obj.initial_state,
            _remove_functions_from_nested_list(obj.H),
            obj.target,
            _remove_functions_from_nested_list(obj.c_ops),
        ),
        {
            k: v
            for (k, v) in obj.__dict__.items()
            if k not in obj._default_attribs
        },
    )


class _ControlPlaceholder:
    """Placeholder for a control function, for pickling"""
    """Placeholder for a control function, for pickling."""

    def __init__(self, id):
        self.id = id
26 changes: 23 additions & 3 deletions src/krotov/optimize.py
@@ -5,6 +5,7 @@
from functools import partial

import numpy as np
import threadpoolctl
from qutip import Qobj
from qutip.parallel import serial_map

@@ -19,6 +20,7 @@
)
from .info_hooks import chain
from .mu import derivative_wrt_pulse
from .parallelization import USE_THREADPOOL_LIMITS
from .propagators import Propagator, expm
from .result import Result
from .second_order import _overlap
@@ -48,7 +50,8 @@ def optimize_pulses(
    continue_from=None,
    skip_initial_forward_propagation=False,
    norm=None,
    overlap=None
    overlap=None,
    limit_thread_pool=None
):
    r"""Use Krotov's method to optimize towards the given `objectives`.

@@ -204,6 +207,14 @@
            :meth:`qutip.Qobj.overlap` for Hilbert space states and to the
            Hilbert-Schmidt norm $\tr[\rho_1^\dagger \rho_2]$ for density
            matrices or operators.
        limit_thread_pool (bool or None): If True, try to eliminate
            multi-threading in low-level numerical routines like :mod:`numpy`,
            via the use of the ``threadpoolctl`` package. Single-threaded
            execution is usually faster, but if you know what you are doing and
            can benchmark multi-threaded execution, you may set this to False
            to place no restrictions on multi-threading. The default value
            (None) delegates to
            :obj:`krotov.parallelization.USE_THREADPOOL_LIMITS`.

    Returns:
        Result: The result of the optimization.
@@ -219,6 +230,12 @@

    # Initialization
    logger.info("Initializing optimization with Krotov's method")
    thread_pool_limiter = None
    if limit_thread_pool is None:
        limit_thread_pool = USE_THREADPOOL_LIMITS
    if limit_thread_pool:
        logger.debug("Setting threadpoolctl.threadpool_limits")
        thread_pool_limiter = threadpoolctl.threadpool_limits(limits=1)
    if mu is None:
        mu = derivative_wrt_pulse
    second_order = sigma is not None
@@ -344,7 +361,7 @@ def optimize_pulses(
            iteration=0,
            info_vals=[],
            shared_data={},
            **info_hook_static_args
            **info_hook_static_args,
        )

# Initialize Result object
@@ -506,7 +523,7 @@ def optimize_pulses(
                info_vals=result.info_vals,
                shared_data={},
                iteration=krotov_iteration,
                **info_hook_static_args
                **info_hook_static_args,
            )
        # Update optimization `result` with info from finished iteration
        result.iters.append(krotov_iteration)
@@ -565,6 +582,9 @@ def optimize_pulses(
    result.end_local_time = time.localtime()
    for i, pulse in enumerate(optimized_pulses):
        result.optimized_controls[i] = pulse_onto_tlist(pulse)
    if thread_pool_limiter is not None:
        logger.debug("Unsetting threadpoolctl.threadpool_limits")
        thread_pool_limiter.unregister()
    return result

