-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from CosmologicalEmulators/develop
Develop
- Loading branch information
Showing
7 changed files
with
272 additions
and
91 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,13 @@ | ||
name = "EmulatorsTrainer"
uuid = "ef107171-9c1d-4869-9860-ac18fb1a7e36"
authors = ["Marco Bonici <[email protected]>"]
version = "0.2.0"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
NPZ = "15e1cf62-19b3-5cfa-8e77-841668bca605"
QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
using Distributed
using NPZ
using ClusterManagers
using EmulatorsTrainer
using JSON3
using Random

#mkdir("/home/mbonici/test_camb")
# Spawn workers on an LSF cluster; swap this for the addprocs variant that
# matches your own scheduler.
addprocs_lsf(100; bsub_flags=`-q medium -n 2 -M 6094`)

@everywhere begin
    using NPZ, EmulatorsTrainer, JSON3, Random, PyCall
    camb = pyimport("camb")

    # Sampled cosmological parameters and their lower/upper bounds (same order).
    pars = ["ln10As", "ns", "H0", "ombh2", "omch2", "tau"]
    lb = [2.5, 0.88, 40., 0.1933, 0.08, 0.02]
    ub = [3.5, 1.05, 100., 0.2533, 0.2, 0.12]

    n = 1000
    s = EmulatorsTrainer.create_training_dataset(n, lb, ub)

    root_dir = "/home/mbonici/test_camb"  # tuned to the author's machine; point at your own directory

    """
        camb_script(CosmoDict, root_path)

    Run CAMB for the cosmology described by `CosmoDict` and dump the resulting
    TT/EE/TE/BB/PP spectra (as `.npy` files) plus the parameter dictionary (as
    JSON) into a fresh, randomly named subdirectory of `root_path`.
    """
    function camb_script(CosmoDict, root_path)
        rand_str = root_path * "/" * randstring(10)
        mkdir(rand_str)
        lmax = 10000

        # set_for_lmax
        lens_potential_accuracy = 8
        lens_margin = 2050  # NOTE(review): defined but never passed to CAMB — confirm intent

        # set_matter_power
        kmax = 10
        k_per_logint = 130
        nonlinear = true  # NOTE(review): defined but never passed to CAMB — confirm intent
        accurate_massive_neutrinos = true

        # set_accuracy
        AccuracyBoost = 2
        lSampleBoost = 2
        lAccuracyBoost = 2
        DoLateRadTruncation = false

        # Renamed from `pars` to avoid shadowing the parameter-name list above.
        camb_pars = camb.CAMBparams()
        camb_pars.set_cosmology(H0 = round(CosmoDict["H0"]; sigdigits=6),
                                ombh2 = round(CosmoDict["ombh2"]; sigdigits=6),
                                omch2 = round(CosmoDict["omch2"]; sigdigits=6),
                                mnu = 0.06,
                                omk = 0.,
                                tau = round(CosmoDict["tau"]; sigdigits=6),
                                Alens = 1.,
                                neutrino_hierarchy = "normal",
                                num_massive_neutrinos = 1,
                                )

        camb_pars.InitPower.set_params(As = round(exp(CosmoDict["ln10As"]) * 10^(-10); sigdigits=6),
                                       ns = round(CosmoDict["ns"]; sigdigits=6),
                                       r = 0)
        camb_pars.set_for_lmax(lmax,
                               lens_potential_accuracy = lens_potential_accuracy)

        camb_pars.set_dark_energy(w = -1., wa = 0., dark_energy_model = "fluid")
        camb_pars.WantTransfer = 1
        camb_pars.NonLinear = camb.model.NonLinear_both # Non-linear matter power & lensing, HMcode
        camb_pars.Transfer.kmax = kmax
        camb_pars.Transfer.k_per_logint = k_per_logint
        camb_pars.Transfer.accurate_massive_neutrinos = accurate_massive_neutrinos

        camb_pars.set_accuracy(AccuracyBoost = AccuracyBoost,
                               lSampleBoost = lSampleBoost,
                               lAccuracyBoost = lAccuracyBoost,
                               DoLateRadTruncation = DoLateRadTruncation)

        results = camb.get_results(camb_pars)

        powers = results.get_cmb_power_spectra(camb_pars, CMB_unit = "muK")

        totCL = powers["total"]
        lens_potential = powers["lens_potential"]

        # Column layout assumed from CAMB's python output: TT, EE, BB, TE for
        # the total spectra, and the first lens_potential column as PP —
        # TODO confirm against the CAMB docs (the original had the same
        # open question for BB).
        clTT = totCL[:, 1]
        clEE = totCL[:, 2]
        clBB = totCL[:, 3]
        clTE = totCL[:, 4]
        clPP = lens_potential[:, 1]

        npzwrite(rand_str * "/cl_TT.npy", clTT)
        npzwrite(rand_str * "/cl_EE.npy", clEE)
        npzwrite(rand_str * "/cl_TE.npy", clTE)
        npzwrite(rand_str * "/cl_BB.npy", clBB)
        npzwrite(rand_str * "/cl_PP.npy", clPP)

        # Store the input cosmology next to its spectra so the pair can be
        # reloaded together later.
        open(rand_str * "/capse_dict.json", "w") do io
            JSON3.write(io, CosmoDict)
        end

    end

end

EmulatorsTrainer.compute_dataset(s, pars, root_dir, camb_script)

# Release all workers once the dataset is computed.
rmprocs(workers())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
using Distributed
using NPZ
using ClusterManagers
using EmulatorsTrainer
using JSON3
using Random

# LSF-specific worker spawn; replace with the addprocs flavour of your scheduler.
addprocs_lsf(10)

@everywhere begin
    using NPZ, EmulatorsTrainer, JSON3, Random

    # Toy 3-parameter problem on the unit cube.
    pars = ["a", "b", "c"]
    lb = [0., 0., 0.]
    ub = [1., 1., 1.]
    n = 1000
    s = EmulatorsTrainer.create_training_dataset(n, lb, ub)

    root_dir = "/home/mbonici/test_emu"  # tuned to the author's machine; point at your own directory

    """
        test_script(my_dict, root_path)

    Evaluate a toy quadratic `a x² + b x + c` on a 100-point grid for the
    coefficients in `my_dict`, and store the result (`.npy`) together with the
    coefficient dictionary (JSON) in a fresh, randomly named subdirectory of
    `root_path`.
    """
    function test_script(my_dict::Dict, root_path)
        rand_str = root_path * "/" * randstring(10)
        mkdir(rand_str)
        sleep(10)  # stand-in for an expensive computation
        a, b, c = my_dict["a"], my_dict["b"], my_dict["c"]
        x = Array(LinRange(0, 10, 100))
        y = @. a * x^2 + b * x + c
        npzwrite(rand_str * "/result.npy", y)
        open(rand_str * "/dict.json", "w") do io
            JSON3.write(io, my_dict)
        end
    end
end

EmulatorsTrainer.compute_dataset(s, pars, root_dir, test_script)

# Release all workers once done.
for i in workers()
    rmprocs(i)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,99 +1,14 @@ | ||
"""
    EmulatorsTrainer

Utilities for building emulator training datasets (sampling and distributed
evaluation, `trainer.jl`) and for loading/normalizing them into train/test
splits (`farmer.jl`).
"""
module EmulatorsTrainer

using DataFrames
using Distributions
using Distributed
using JSON3
using NPZ
using QuasiMonteCarlo
using Random

include("trainer.jl")
include("farmer.jl")

end # module EmulatorsTrainer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
"""
    create_training_dataset(n::Int, lb, ub)

Draw `n` Latin-hypercube samples from the box with lower bounds `lb` and upper
bounds `ub`, returning a `length(lb) × n` matrix (one parameter combination
per column, as produced by `QuasiMonteCarlo.sample`).

Throws `DimensionMismatch` if `lb` and `ub` differ in length.
"""
function create_training_dataset(n::Int, lb::AbstractArray, ub::AbstractArray)
    # Fail early with a clear message instead of a downstream sampler error.
    length(lb) == length(ub) ||
        throw(DimensionMismatch("lb and ub must have the same length"))
    return QuasiMonteCarlo.sample(n, lb, ub, LatinHypercubeSample())
end
|
||
"""
    create_training_dict(training_matrix, idx_comb, params)

Build a `Dict` mapping each parameter name in `params` to its value in column
`idx_comb` of `training_matrix` (row i ↔ `params[i]`).
"""
function create_training_dict(training_matrix::Matrix, idx_comb::Int, params::Array{String})
    return Dict(name => training_matrix[row, idx_comb]
                for (row, name) in enumerate(params))
end
|
||
"""
    compute_dataset(training_matrix, params, root_dir, script_func)

Create `root_dir` and call `script_func(dict, root_dir)` once per column of
`training_matrix`, with the column converted to a name→value `Dict` via
`create_training_dict`. Work is spread over the available workers with
`@distributed` and awaited with `@sync`.

`mkdir` (not `mkpath`) is deliberate: it errors if `root_dir` already exists,
preventing silent mixing with a previous dataset.
"""
function compute_dataset(training_matrix::Matrix, params::Array{String}, root_dir::String, script_func::Function)
    # One column per parameter combination; the unused row count is not bound.
    n_combs = size(training_matrix, 2)
    mkdir(root_dir)
    @sync @distributed for idx in 1:n_combs
        train_dict = create_training_dict(training_matrix, idx, params)
        script_func(train_dict, root_dir)
    end
    return nothing
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
"""
    add_observable_df!(df, location, param_file, observable_file, first_idx, last_idx, get_tuple)

Read the parameter JSON at `location * param_file` and the observable array at
`location * observable_file`, slice the observable to `first_idx:last_idx`,
convert the (params, observable) pair to a row via `get_tuple`, and `push!` it
onto `df`. Returns `nothing`.
"""
function add_observable_df!(df::DataFrames.DataFrame, location::String, param_file::String,
    observable_file::String, first_idx::Int, last_idx::Int, get_tuple::Function)
    json_string = read(location*param_file, String)
    cosmo_pars = JSON3.read(json_string)

    # NOTE(review): the original called npzread(path, "r"), apparently treating
    # the second argument as an open-mode flag; npzread has no such parameter
    # (its optional second argument selects variables in an .npz archive), so
    # the stray "r" is dropped here.
    observable = npzread(location*observable_file)[first_idx:last_idx]
    observable_filtered = get_tuple(cosmo_pars, observable)
    push!(df, observable_filtered)
    return nothing
end
|
||
"""
    load_df_directory!(df, Directory, add_observable_df!)

Walk `Directory` recursively and, for every `.json` file found, call
`add_observable_df!(df, root)` with the file's containing directory (the
callback is expected to load that directory's parameter/observable pair).
"""
function load_df_directory!(df::DataFrames.DataFrame, Directory,
    add_observable_df!::Function)
    for (root, dirs, files) in walkdir(Directory)
        for file in files
            # endswith is total: the original findlast(isequal('.'), file)
            # returned `nothing` for extensionless files and the subsequent
            # indexing threw.
            if endswith(file, ".json")
                add_observable_df!(df, root)
            end
        end
    end
    return nothing
end
|
||
"""
    extract_input_output_df(df, n_input_features, n_output_features)

Split a table into emulator inputs and outputs. The first `n_input_features`
columns become the `(n_input_features × n_samples)` input matrix; column
`n_input_features + 1` is assumed to hold, per row, a vector of length
`n_output_features` (the "observable"), which fills the
`(n_output_features × n_samples)` output matrix.

#TODO check whether this can be done WITHOUT n_input and n_output
"""
function extract_input_output_df(df, n_input_features::Int, n_output_features::Int)
    array_df = Matrix(df)  # convert table to a plain matrix
    # Count samples directly instead of materializing a row slice just to
    # measure its length (the original allocated array_input[1,:] for this).
    n_samples = size(array_df, 1)
    array_input = transpose(Array(array_df[:, 1:n_input_features]))
    array_output = zeros(n_output_features, n_samples)
    for element in 1:n_samples
        array_output[:, element] = array_df[element, n_input_features+1]
    end
    return array_input, array_output
end
|
||
"""
    get_minmax_in(df, array_pars_in)

Return a `length(array_pars_in) × 2` matrix whose i-th row holds the minimum
and maximum of column `array_pars_in[i]` of `df`.

#TODO check whether this can work on array_input
"""
function get_minmax_in(df, array_pars_in)
    in_MinMax = zeros(length(array_pars_in), 2)
    for (idx, key) in enumerate(array_pars_in)
        # Single-pass extrema instead of separate minimum/maximum scans.
        in_MinMax[idx, 1], in_MinMax[idx, 2] = extrema(df[!, key])
    end
    return in_MinMax
end
|
||
"""
    get_minmax_out(array_out, n_output_features)

Return an `n_output_features × 2` matrix whose i-th row holds the minimum and
maximum of row `i` of `array_out` (one row per output feature, one column per
sample).
"""
function get_minmax_out(array_out, n_output_features)
    out_MinMax = zeros(n_output_features, 2)
    for i in 1:n_output_features
        # Single-pass extrema over a view: no row copy, no double scan.
        out_MinMax[i, 1], out_MinMax[i, 2] = extrema(view(array_out, i, :))
    end
    return out_MinMax
end
|
||
"""
    maximin_df!(df, in_MinMax, out_MinMax)

Min-max normalize `df` in place to [0, 1]: the first `size(in_MinMax, 1)`
columns are scaled with the per-column bounds in `in_MinMax`, and each row's
"observable" vector is scaled element-wise with the bounds in `out_MinMax`.
"""
function maximin_df!(df, in_MinMax, out_MinMax)
    n_input_features, _ = size(in_MinMax)
    for i in 1:n_input_features
        df[!, i] .-= in_MinMax[i, 1]
        df[!, i] ./= (in_MinMax[i, 2] - in_MinMax[i, 1])
    end
    # Hoist the loop-invariant offset/range vectors out of the per-row loop
    # (the original re-sliced and re-subtracted out_MinMax on every row).
    out_min = out_MinMax[:, 1]
    out_range = out_MinMax[:, 2] .- out_MinMax[:, 1]
    for i in 1:nrow(df)
        df[!, "observable"][i] .-= out_min
        df[!, "observable"][i] ./= out_range
    end
end
|
||
"""
    splitdf(df, pct; rng=Random.default_rng())

Randomly partition the rows of `df` into two views: the first with roughly
`pct * nrow(df)` rows, the second with the rest. Pass an explicit `rng` for a
reproducible split (new keyword; the default preserves the original
global-RNG behavior).
"""
function splitdf(df, pct; rng::Random.AbstractRNG=Random.default_rng())
    @assert 0 <= pct <= 1
    ids = collect(axes(df, 1))
    shuffle!(rng, ids)
    # Row j is selected iff its shuffled id falls below the cut — a uniform
    # random subset of ~pct of the rows.
    sel = ids .<= nrow(df) .* pct
    return view(df, sel, :), view(df, .!sel, :)
end
|
||
"""
    traintest_split(df, test)

Shuffle-split `df`, reserving a fraction `test` of the rows for testing.
Returns `(train, test)` views (note: `splitdf` yields them in the opposite
order).
"""
function traintest_split(df, test)
    test_part, train_part = splitdf(df, test)
    return train_part, test_part
end
|
||
"""
    getdata(df, n_input_features, n_output_features)

Split `df` 80/20 into train/test sets and return
`(xtrain, ytrain, xtest, ytest)` as `Float64` arrays, with inputs and outputs
extracted via `extract_input_output_df`.
"""
function getdata(df, n_input_features::Int, n_output_features::Int)
    # NOTE(review): DataDeps auto-accept switch kept from the original;
    # nothing in this function visibly uses it — confirm before removing.
    ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

    train_df, test_df = traintest_split(df, 0.2)

    (xtrain, ytrain), (xtest, ytest) =
        (extract_input_output_df(part, n_input_features, n_output_features)
         for part in (train_df, test_df))

    return Float64.(xtrain), Float64.(ytrain), Float64.(xtest), Float64.(ytest)
end