Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add base file records' in-memory size to FileStats #140

Merged
merged 2 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion crates/core/src/file_group/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,15 @@ impl FileSlice {
.get_parquet_file_metadata(&self.base_file_relative_path())
.await?;
let num_records = parquet_meta.file_metadata().num_rows();
let stats = FileStats { num_records };
let size_bytes = parquet_meta
.row_groups()
.iter()
.map(|rg| rg.total_byte_size())
.sum::<i64>();
let stats = FileStats {
num_records,
size_bytes,
};
self.base_file.stats = Some(stats);
}
Ok(())
Expand Down
3 changes: 2 additions & 1 deletion crates/core/src/storage/file_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
* under the License.
*/

/// Statistics collected for a base file from its parquet metadata.
///
/// Every field is a plain `i64`, so the derived `Default` yields all zeros;
/// that is the value consumers observe through `unwrap_or_default()` when the
/// stats have not been loaded.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
#[derive(Clone, Debug, Default)]
pub struct FileStats {
/// Number of records, taken from the parquet file metadata's `num_rows()`.
pub num_records: i64,
/// Sum of `total_byte_size()` over all parquet row groups of the file,
/// i.e. the records' in-memory size.
/// NOTE(review): presumably the uncompressed size rather than the on-disk
/// file size -- confirm against the parquet crate documentation.
pub size_bytes: i64,
}
1 change: 1 addition & 0 deletions python/hudi/_internal.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class HudiFileSlice:
base_file_name: str
base_file_size: int
num_records: int
size_bytes: int

def base_file_relative_path(self) -> str: ...

Expand Down
7 changes: 6 additions & 1 deletion python/src/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ pub struct HudiFileSlice {
base_file_size: usize,
#[pyo3(get)]
num_records: i64,
#[pyo3(get)]
size_bytes: i64,
}

#[cfg(not(tarpaulin))]
Expand All @@ -69,14 +71,17 @@ fn convert_file_slice(f: &FileSlice) -> HudiFileSlice {
let commit_time = f.base_file.commit_time.to_string();
let base_file_name = f.base_file.info.name.clone();
let base_file_size = f.base_file.info.size;
let num_records = f.base_file.stats.clone().unwrap_or_default().num_records;
let stats = f.base_file.stats.clone().unwrap_or_default();
let num_records = stats.num_records;
let size_bytes = stats.size_bytes;
HudiFileSlice {
file_group_id,
partition_path,
commit_time,
base_file_name,
base_file_size,
num_records,
size_bytes,
}
}

Expand Down
1 change: 1 addition & 0 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def test_sample_table(get_sample_table):
"20240402144910683",
}
assert all(f.num_records == 1 for f in file_slices)
assert all(f.size_bytes > 0 for f in file_slices)
file_slice_paths = [f.base_file_relative_path() for f in file_slices]
assert set(file_slice_paths) == {
"chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",
Expand Down
Loading