diff --git a/Project.toml b/Project.toml index 60161db8..5769720f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "UnROOT" uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9" authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"] -version = "0.9.2" +version = "0.10.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/docs/src/index.md b/docs/src/index.md index 3d57fb4b..ca9390bb 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -82,3 +82,81 @@ The laziness of the main interfaces are summarized below: | `@threads for X in ...`| 💤 | 💤 | | `getindex(tree, row::Int)`| 💤 | N/A | | `getindex(tree, row::Range)`| 🚨 | N/A | + +## Lazy tree construction + +As seen in the previous section, `LazyTree`s are cheap and offer a convenient +way to create an object that isolates the branches of interest. It's fairly +common that multiple branches are present with slightly differing names, like +`pos.x`, `pos.y` etc. The `LazyTree` function also takes regular expressions, as +seen in the example below where `r"Evt/trks/trks.pos.[xyz]"` is passed, that +will match the corresponding branches: + +```julia-repl +julia> f = UnROOT.samplefile("km3net_offline.root") +ROOTFile with 2 entries and 25 streamers. 
+/Users/tamasgal/Dev/UnROOT.jl/test/samples/km3net_offline.root +├─ E (TTree) +│ └─ "Evt" +└─ Header (Head) + +julia> t = LazyTree(f, "E", ["Evt/trks/trks.id", r"Evt/trks/trks.pos.[xyz]"]) + Row │ Evt_trks_trks_i Evt_trks_trks_p Evt_trks_trks_p Evt_trks_trks_p ⋯ + │ SubArray{Int32, SubArray{Float6 SubArray{Float6 SubArray{Float6 ⋯ +─────┼─────────────────────────────────────────────────────────────────────────── + 1 │ [1, 2, 3, 4, 5, [615.0, 615.0, [446.0, 446.0, [125.0, 125.0, 70.7, ⋯ + 2 │ [1, 2, 3, 4, 5, [533.0, 533.0, [465.0, 465.0, [80.7, 80.7, 39.1, 3 ⋯ + 3 │ [1, 2, 3, 4, 5, [593.0, 593.0, [457.0, 457.0, [194.0, 194.0, 96.5, ⋯ + 4 │ [1, 2, 3, 4, 5, [590.0, 590.0, [440.0, 440.0, [204.0, 204.0, 124.0 ⋯ + 5 │ [1, 2, 3, 4, 5, [546.0, 546.0, [440.0, 440.0, [58.6, 58.6, 30.1, 3 ⋯ + 6 │ [1, 2, 3, 4, 5, [585.0, 585.0, [424.0, 424.0, [202.0, 202.0, 183.0 ⋯ + 7 │ [1, 2, 3, 4, 5, [533.0, 533.0, [440.0, 440.0, [47.3, 47.3, 30.1, 2 ⋯ + 8 │ [1, 2, 3, 4, 5, [569.0, 569.0, [469.0, 469.0, [200.0, 200.0, 179.0 ⋯ + 9 │ [1, 2, 3, 4, 5, [557.0, 557.0, [412.0, 412.0, [209.0, 209.0, 101.0 ⋯ + 10 │ [1, 2, 3, 4, 5, [532.0, 532.0, [443.0, 443.0, [172.0, 172.0, 126.0 ⋯ + 1 column omitted + +julia> names(t) +4-element Vector{String}: + "Evt_trks_trks_id" + "Evt_trks_trks_pos_y" + "Evt_trks_trks_pos_x" + "Evt_trks_trks_pos_z" + +julia> t.Evt_trks_trks_pos_y +10-element LazyBranch{SubArray{Float64, 1, Vector{Float64}, Tuple{UnitRange{Int64}}, true}, +UnROOT.Nooffsetjagg, ArraysOfArrays.VectorOfVectors{Float64, Vector{Float64}, Vector{Int32}, +Vector{Tuple{}}}}: + [615.1089636184813, 615.1089636184813, … 574.836340445788, 576.5382993955498] + ... + ... +``` + +Branch names are normalised so that they contain valid characters for +identifiers. The branchname `Evt/trks/trks.pos.y` for example is therefore +converted to `Evt_trks_trks_pos_y`, which might be a bit inconvenient to use. 
+`LazyTree` can rename branches based on regular expressions and substitution +strings (in Julia these are created with `s""`) which can be passed as `Pair`s. +The example below shows how to use this: + +```julia-repl +julia> t = LazyTree(f, "E", [r"Evt/trks/trks.(dir|pos).([xyz])" => s"\1_\2"]) + Row │ pos_z dir_z pos_y dir_y ⋯ + │ SubArray{Float6 SubArray{Float6 SubArray{Float6 SubArray{Float6 ⋯ +─────┼───────────────────────────────────────────────────────────────────────────────────── + 1 │ [125.0, 125.0, 70.7 [-0.873, -0.873, -0 [615.0, 615.0, 585. [-0.487, -0.487, -0 ⋯ + 2 │ [80.7, 80.7, 39.1, [-0.835, -0.835, -0 [533.0, 533.0, 559. [0.521, 0.521, 0.52 ⋯ + 3 │ [194.0, 194.0, 96.5 [-0.989, -0.989, -0 [593.0, 593.0, 581. [-0.122, -0.122, -0 ⋯ + 4 │ [204.0, 204.0, 124. [-0.968, -0.968, -0 [590.0, 590.0, 571. [-0.23, -0.23, -0.2 ⋯ + 5 │ [58.6, 58.6, 30.1, [-0.821, -0.821, -0 [546.0, 546.0, 565. [0.54, 0.54, 0.54, ⋯ + 2 columns omitted + +julia> names(t) +6-element Vector{String}: + "pos_z" + "dir_z" + "pos_y" + "dir_y" + "dir_x" + "pos_x" +``` diff --git a/src/iteration.jl b/src/iteration.jl index b937f637..e7ceb9ef 100644 --- a/src/iteration.jl +++ b/src/iteration.jl @@ -378,34 +378,60 @@ function LazyTree(f::ROOTFile, s::AbstractString, branches) error("$s is not the name of a TTree or a RNTuple.") end -function LazyTree(f::ROOTFile, tree::TTree, s, branches) +""" + normalize_branchname(s::AbstractString) + +Split `s` at `.` and `/`, drop the elements directly following the first one that +merely repeat it and every `fCoordinates` element, then join the remaining parts +with `_`. A name that contains no separator is returned unchanged. +""" +function normalize_branchname(s::AbstractString) + # split by `.` or `/` + norm_name = s + v = split(s, r"\.|\/") + if length(v) >= 2 # only normalize name when branches are split + head = v[1] + tail = v[2:end] + # remove duplicate info (only consecutive occurrences) + idx = 1 + for e ∈ tail + e != head && break + idx += 1 + end + elements = tail[idx:end] + # remove known split branch information + filter!(e -> e != "fCoordinates", elements) + norm_name = join([head; elements], "_") + end + norm_name +end + +""" + LazyTree(f::ROOTFile, tree::TTree, treepath, branches) + +Creates a lazy tree object 
of the selected branches only. `branches` is a vector +of `String`, `Regex` or `Pair{Regex, SubstitutionString}`, where the first item +is the regex selector and the second item the rename pattern. + +""" +function LazyTree(f::ROOTFile, tree::TTree, treepath, branches) d = Dict{Symbol,LazyBranch}() _m(r::Regex) = Base.Fix1(occursin, r) all_bnames = getbranchnamesrecursive(tree) res_bnames = mapreduce(∪, branches) do b if b isa Regex - filter(_m(b), all_bnames) + [_b => normalize_branchname(_b) for _b ∈ filter(_m(b), all_bnames)] + elseif b isa Pair{Regex, SubstitutionString{String}} + [_b => replace(_b, first(b) => last(b)) for _b ∈ filter(_m(first(b)), all_bnames)] elseif b isa String expand = any(n->startswith(n, "$b/$b"), all_bnames) - expand ? filter(n->startswith(n, "$b/$b"), all_bnames) : [b] + expand ? [_b => normalize_branchname(_b) for _b ∈ filter(n->startswith(n, "$b/$b"), all_bnames)] : [b => normalize_branchname(b)] else - error("branch selection must be string or regex") + error("branch selection must be a String, a Regex or a Regex => SubstitutionString pair") end end - for b in res_bnames - # split by `.` or `/` - norm_name = b - v = split(b, r"\.|\/") - if length(v) >= 2 # only normalize name when branches are split - head = v[1] - tail = v[2:end] - # remove duplicated info - replace!(tail, head => "") - # remove known split branch information - replace!(tail, "fCoordinates" => "") - norm_name = join([head; join(tail)], "_") - end - d[Symbol(norm_name)] = LazyBranch(f, "$s/$b") + for (b, norm_name) in res_bnames + d[Symbol(norm_name)] = LazyBranch(f, "$treepath/$b") end return LazyTree(NamedTuple{Tuple(keys(d))}(values(d))) end diff --git a/test/runtests.jl b/test/runtests.jl index 120e1cd1..ab2aae6b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -244,6 +244,19 @@ end @test sum(table.int32_array) == sum(row.int32_array for row in table) @test [row.int32_array for row in table] == BA close(rootfile) + + rootfile = UnROOT.samplefile("km3net_offline.root") + t = 
LazyTree(rootfile, "E", ["Evt/trks/trks.id", r"Evt/trks/trks.(dir|pos).([xyz])" => s"\1_\2"]) + @test 10 == length(t.Evt_trks_trks_id) + @test 10 == length(t.dir_x) + @test 10 == length(t.dir_y) + @test 10 == length(t.dir_z) + @test 10 == length(t.pos_x) + @test 10 == length(t.pos_y) + @test 10 == length(t.pos_z) + @test 56 == length(t.pos_z[1]) + @test 68.42717410489223 ≈ t.pos_z[1][5] + close(rootfile) end @testset "TLorentzVector" begin