Merge pull request #5 from CosmologicalEmulators/develop
Develop
marcobonici authored Aug 18, 2023
2 parents 06a5fe6 + 77708e3 commit 831f052
Showing 7 changed files with 272 additions and 91 deletions.
5 changes: 4 additions & 1 deletion Project.toml
@@ -1,10 +1,13 @@
name = "EmulatorsTrainer"
uuid = "ef107171-9c1d-4869-9860-ac18fb1a7e36"
authors = ["Marco Bonici <[email protected]>"]
version = "0.1.0"
version = "0.2.0"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
NPZ = "15e1cf62-19b3-5cfa-8e77-841668bca605"
QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1 change: 1 addition & 0 deletions README.md
@@ -1,4 +1,5 @@
# EmulatorsTrainer.jl

This repository contains the code used to train the surrogate models of the CosmologicalEmulators organization.

## Roadmap to v1.0.0
114 changes: 114 additions & 0 deletions scripts/camb_script.jl
@@ -0,0 +1,114 @@
using Distributed
using NPZ
using ClusterManagers
using EmulatorsTrainer
using JSON3
using Random


#mkdir("/home/mbonici/test_camb")
addprocs_lsf(100; bsub_flags=`-q medium -n 2 -M 6094`) # I am using an LSF cluster; use the cluster manager appropriate for your system!
@everywhere begin
using NPZ, EmulatorsTrainer, JSON3, Random, PyCall
camb = pyimport("camb")
pars = ["ln10As", "ns", "H0", "ombh2", "omch2", "tau"]
lb = [2.5, 0.88, 40., 0.1933, 0.08, 0.02]
ub = [3.5, 1.05, 100., 0.2533, 0.2, 0.12]


n = 1000
s = EmulatorsTrainer.create_training_dataset(n, lb, ub)

root_dir = "/home/mbonici/test_camb" # this path is specific to my setup; use the right one for you!

function camb_script(CosmoDict, root_path)
rand_str = root_path*"/"*randstring(10)
mkdir(rand_str)
lmax = 10000

# set_for_lmax
lens_potential_accuracy = 8
lens_margin = 2050

# set_matter_power
kmax = 10
k_per_logint = 130
nonlinear = true
accurate_massive_neutrinos = true

# set_accuracy
AccuracyBoost = 2
lSampleBoost = 2
lAccuracyBoost = 2
DoLateRadTruncation = false

pars = camb.CAMBparams()
pars.set_cosmology(H0 = round(CosmoDict["H0"]; sigdigits=6),
ombh2 = round(CosmoDict["ombh2"]; sigdigits=6),
omch2 = round(CosmoDict["omch2"]; sigdigits=6),
mnu = 0.06,
omk = 0.,
tau = round(CosmoDict["tau"]; sigdigits=6),
Alens = 1.,
neutrino_hierarchy = "normal",
num_massive_neutrinos = 1)

pars.InitPower.set_params(As=round(exp(CosmoDict["ln10As"])*10^(-10); sigdigits=6),
ns=round(CosmoDict["ns"]; sigdigits=6),
r=0)
pars.set_for_lmax(lmax,
lens_potential_accuracy=lens_potential_accuracy)

pars.set_dark_energy(w=-1., wa=0., dark_energy_model="fluid")
pars.WantTransfer = 1
pars.NonLinear = camb.model.NonLinear_both # Non-linear matter power & lensing, HMcode
pars.Transfer.kmax = kmax
pars.Transfer.k_per_logint = k_per_logint
pars.Transfer.accurate_massive_neutrinos = accurate_massive_neutrinos


pars.set_accuracy(AccuracyBoost=AccuracyBoost,
lSampleBoost=lSampleBoost,
lAccuracyBoost=lAccuracyBoost,
DoLateRadTruncation=DoLateRadTruncation)

results = camb.get_results(pars)

powers = results.get_cmb_power_spectra(pars, CMB_unit="muK")

totCL=powers["total"]
unlensedCL=powers["unlensed_scalar"]
lens_potential=powers["lens_potential"]

# CAMB orders the total Cl columns as TT, EE, BB, TE
clTT = totCL[:,1]
clEE = totCL[:,2]
clBB = totCL[:,3] # @fbianchini: is this right?
clTE = totCL[:,4]
clPP = lens_potential[:,1]

npzwrite(rand_str*"/cl_TT.npy", clTT)
npzwrite(rand_str*"/cl_EE.npy", clEE)
npzwrite(rand_str*"/cl_TE.npy", clTE)
npzwrite(rand_str*"/cl_BB.npy", clBB)
npzwrite(rand_str*"/cl_PP.npy", clPP)

open(rand_str*"/capse_dict.json", "w") do io
JSON3.write(io, CosmoDict)
end

end

end

EmulatorsTrainer.compute_dataset(s, pars, root_dir, camb_script)

for i in workers()
rmprocs(i)
end
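
For a quick sanity check of what camb_script.jl leaves on disk, the short sketch below reads one sample back; the random subdirectory name is hypothetical, but the file names match the ones written above (cl_TT.npy, ..., capse_dict.json).

using NPZ, JSON3
sample_dir = "/home/mbonici/test_camb/aB3xQ9kL2m" # hypothetical: one of the randstring(10) subdirectories
clTT = npzread(sample_dir*"/cl_TT.npy")
cosmo = JSON3.read(read(sample_dir*"/capse_dict.json", String)) # the input cosmology for this sample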
41 changes: 41 additions & 0 deletions scripts/test_script.jl
@@ -0,0 +1,41 @@
using Distributed
using NPZ
using ClusterManagers
using EmulatorsTrainer
using JSON3
using Random

addprocs_lsf(10) # I am using an LSF cluster; use the cluster manager appropriate for your system!

@everywhere begin
using NPZ, EmulatorsTrainer, JSON3, Random

pars = ["a", "b", "c"]
lb = [0., 0., 0.]
ub = [1., 1., 1.]
n = 1000
s = EmulatorsTrainer.create_training_dataset(n, lb, ub)

root_dir = "/home/mbonici/test_emu" # this path is specific to my setup; use the right one for you!

function test_script(my_dict::Dict, root_path)
rand_str = root_path*"/"*randstring(10)
mkdir(rand_str)
sleep(10) # artificial delay to mimic an expensive computation
a = my_dict["a"]
b = my_dict["b"]
c = my_dict["c"]
x = Array(LinRange(0,10,100))
y = a .* x .^ 2 .+ b .* x .+ c
npzwrite(rand_str*"/result.npy", y)
open(rand_str*"/dict.json", "w") do io
JSON3.write(io, my_dict)
end
end
end

EmulatorsTrainer.compute_dataset(s, pars, root_dir, test_script)

for i in workers()
rmprocs(i)
end
95 changes: 5 additions & 90 deletions src/EmulatorsTrainer.jl
@@ -1,99 +1,14 @@
module EmulatorsTrainer

using DataFrames
using Distributions
using Distributed
using JSON3
using NPZ
using QuasiMonteCarlo
using Random

function add_observable_df!(df::DataFrames.DataFrame, location::String, param_file::String,
observable_file::String, first_idx::Int, last_idx::Int, get_tuple::Function)
json_string = read(location*param_file, String)
cosmo_pars = JSON3.read(json_string)

observable = npzread(location*observable_file)[first_idx:last_idx]
observable_filtered = get_tuple(cosmo_pars, observable)
push!(df, observable_filtered)
return nothing
end

function load_df_directory!(df::DataFrames.DataFrame, Directory,
add_observable_df!::Function)
for (root, dirs, files) in walkdir(Directory)
for file in files
file_extension = file[findlast(isequal('.'),file):end]
if file_extension == ".json"
add_observable_df!(df, root)
end
end
end
end

function extract_input_output_df(df, n_input_features::Int, n_output_features::Int)
array_df = Matrix(df) # convert the DataFrame to a matrix
#TODO check whether this can be done WITHOUT n_input and n_output
array_input = transpose(Array(array_df[:,1:n_input_features])) # the first n_input_features columns hold the inputs
array_output = zeros(n_output_features, length(array_input[1,:])) # one column of n_output_features values per sample
for element in 1:length(array_input[1,:])
array_output[:,element] = array_df[element,n_input_features+1]
end
return array_input, array_output
end

function get_minmax_in(df, array_pars_in)
#TODO check whether this can work on array_input
in_MinMax = zeros(length(array_pars_in),2)
for (idx, key) in enumerate(array_pars_in)
in_MinMax[idx,1] = minimum(df[!,key])
in_MinMax[idx,2] = maximum(df[!,key])
end
return in_MinMax
end

function get_minmax_out(array_out, n_output_features)
out_MinMax = zeros(n_output_features,2)

# one (min, max) pair per output feature
for i in 1:n_output_features
out_MinMax[i, 1] = minimum(array_out[i,:])
out_MinMax[i, 2] = maximum(array_out[i,:])
end
return out_MinMax
end

function maximin_df!(df, in_MinMax, out_MinMax)
n_input_features, _ = size(in_MinMax)
for i in 1:n_input_features
df[!,i] .-= in_MinMax[i,1]
df[!,i] ./= (in_MinMax[i,2]-in_MinMax[i,1])
end
for i in 1:nrow(df)
df[!,"observable"][i] .-= out_MinMax[:,1]
df[!,"observable"][i] ./= (out_MinMax[:,2]-out_MinMax[:,1])
end
end

function splitdf(df, pct)
@assert 0 <= pct <= 1
ids = collect(axes(df, 1))
shuffle!(ids)
sel = ids .<= nrow(df) .* pct
return view(df, sel, :), view(df, .!sel, :)
end

function traintest_split(df, test)
te, tr = splitdf(df, test)
return tr, te
end

function getdata(df, n_input_features::Int, n_output_features::Int)
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

train_df, test_df = traintest_split(df, 0.2)

xtrain, ytrain = extract_input_output_df(train_df, n_input_features, n_output_features)
xtest, ytest = extract_input_output_df(test_df, n_input_features, n_output_features)

return Float64.(xtrain), Float64.(ytrain), Float64.(xtest), Float64.(ytest)
end
include("trainer.jl")
include("farmer.jl")

end # module EmulatorsTrainer
17 changes: 17 additions & 0 deletions src/farmer.jl
@@ -0,0 +1,17 @@
function create_training_dataset(n::Int, lb::Array, ub::Array)
return QuasiMonteCarlo.sample(n, lb, ub, LatinHypercubeSample())
end

function create_training_dict(training_matrix::Matrix, idx_comb::Int, params::Array{String})
return Dict([(value, training_matrix[idx_par, idx_comb])
for (idx_par, value) in enumerate(params)])
end

function compute_dataset(training_matrix::Matrix, params::Array{String}, root_dir::String, script_func::Function)
n_pars, n_combs = size(training_matrix)
mkdir(root_dir)
@sync @distributed for idx in 1:n_combs
train_dict = create_training_dict(training_matrix, idx, params)
script_func(train_dict, root_dir)
end
end
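
To make the data layout explicit, here is a minimal sketch (toy parameter names, local execution) of how the three helpers above fit together: create_training_dataset returns an n_params × n_samples matrix, create_training_dict turns one of its columns into a named Dict, and compute_dataset hands one such Dict per sample to the user-supplied script function.

using EmulatorsTrainer

toy_pars = ["a", "b"]
lb = [0., 0.]
ub = [1., 1.]
s = EmulatorsTrainer.create_training_dataset(4, lb, ub) # 2×4 Latin hypercube sample
d = EmulatorsTrainer.create_training_dict(s, 1, toy_pars) # Dict("a" => s[1,1], "b" => s[2,1])
# assumes /tmp/toy_dataset does not exist yet, since compute_dataset calls mkdir on it
EmulatorsTrainer.compute_dataset(s, toy_pars, "/tmp/toy_dataset", (dict, dir) -> @info dict)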
90 changes: 90 additions & 0 deletions src/trainer.jl
@@ -0,0 +1,90 @@
function add_observable_df!(df::DataFrames.DataFrame, location::String, param_file::String,
observable_file::String, first_idx::Int, last_idx::Int, get_tuple::Function)
json_string = read(location*param_file, String)
cosmo_pars = JSON3.read(json_string)

observable = npzread(location*observable_file)[first_idx:last_idx]
observable_filtered = get_tuple(cosmo_pars, observable)
push!(df, observable_filtered)
return nothing
end

function load_df_directory!(df::DataFrames.DataFrame, Directory,
add_observable_df!::Function)
for (root, dirs, files) in walkdir(Directory)
for file in files
file_extension = file[findlast(isequal('.'),file):end]
if file_extension == ".json"
add_observable_df!(df, root)
end
end
end
end

function extract_input_output_df(df, n_input_features::Int, n_output_features::Int)
array_df = Matrix(df) # convert the DataFrame to a matrix
#TODO check whether this can be done WITHOUT n_input and n_output
array_input = transpose(Array(array_df[:,1:n_input_features])) # the first n_input_features columns hold the inputs
array_output = zeros(n_output_features, length(array_input[1,:])) # one column of n_output_features values per sample
for element in 1:length(array_input[1,:])
array_output[:,element] = array_df[element,n_input_features+1]
end
return array_input, array_output
end

function get_minmax_in(df, array_pars_in)
#TODO check whether this can work on array_input
in_MinMax = zeros(length(array_pars_in),2)
for (idx, key) in enumerate(array_pars_in)
in_MinMax[idx,1] = minimum(df[!,key])
in_MinMax[idx,2] = maximum(df[!,key])
end
return in_MinMax
end

function get_minmax_out(array_out, n_output_features)
out_MinMax = zeros(n_output_features,2)

# one (min, max) pair per output feature
for i in 1:n_output_features
out_MinMax[i, 1] = minimum(array_out[i,:])
out_MinMax[i, 2] = maximum(array_out[i,:])
end
return out_MinMax
end

function maximin_df!(df, in_MinMax, out_MinMax)
n_input_features, _ = size(in_MinMax)
for i in 1:n_input_features
df[!,i] .-= in_MinMax[i,1]
df[!,i] ./= (in_MinMax[i,2]-in_MinMax[i,1])
end
for i in 1:nrow(df)
df[!,"observable"][i] .-= out_MinMax[:,1]
df[!,"observable"][i] ./= (out_MinMax[:,2]-out_MinMax[:,1])
end
end

function splitdf(df, pct)
@assert 0 <= pct <= 1
ids = collect(axes(df, 1))
shuffle!(ids)
sel = ids .<= nrow(df) .* pct
return view(df, sel, :), view(df, .!sel, :)
end

function traintest_split(df, test)
te, tr = splitdf(df, test)
return tr, te
end

function getdata(df, n_input_features::Int, n_output_features::Int)
ENV["DATADEPS_ALWAYS_ACCEPT"] = "true"

train_df, test_df = traintest_split(df, 0.2)

xtrain, ytrain = extract_input_output_df(train_df, n_input_features, n_output_features)
xtest, ytest = extract_input_output_df(test_df, n_input_features, n_output_features)

return Float64.(xtrain), Float64.(ytrain), Float64.(xtest), Float64.(ytest)
end
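
Once a dataset has been generated with the farmer utilities, the helpers above are meant to be chained roughly as in this sketch. It is not a definitive recipe: the directory, the Cl slice (2:2500, i.e. 2499 output features) and the get_tuple adapter are assumptions that must match how the observables were actually written to disk.

using DataFrames, EmulatorsTrainer

pars = ["ln10As", "ns", "H0", "ombh2", "omch2", "tau"]
df = DataFrame([par => Float64[] for par in pars]..., "observable" => Vector{Float64}[])

# adapter returning the input parameters followed by the observable vector, in column order
get_tuple(cosmo, cl) = (cosmo.ln10As, cosmo.ns, cosmo.H0, cosmo.ombh2, cosmo.omch2, cosmo.tau, cl)
# closure with the (df, root) signature expected by load_df_directory!
loader!(df, root) = EmulatorsTrainer.add_observable_df!(df, root, "/capse_dict.json", "/cl_TT.npy", 2, 2500, get_tuple)

EmulatorsTrainer.load_df_directory!(df, "/home/mbonici/test_camb", loader!)

n_in, n_out = length(pars), 2499
in_MinMax = EmulatorsTrainer.get_minmax_in(df, pars)
_, array_out = EmulatorsTrainer.extract_input_output_df(df, n_in, n_out)
out_MinMax = EmulatorsTrainer.get_minmax_out(array_out, n_out)
EmulatorsTrainer.maximin_df!(df, in_MinMax, out_MinMax)
xtrain, ytrain, xtest, ytest = EmulatorsTrainer.getdata(df, n_in, n_out)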
