Skip to content

Commit

Permalink
Metadata test rowgroups.parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
platypii committed Jan 11, 2024
1 parent f2a15bd commit 62632d9
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ export function parquetMetadata(arrayBuffer) {
if (metadataLength <= 0 || metadataLength > metadataLengthOffset) {
throw new Error('parquet file invalid metadata length')
}
if (metadataLength > view.byteLength - 8) {
throw new Error('parquet file metadata length exceeds file size')
}

const metadataOffset = metadataLengthOffset - metadataLength
const metadataBuffer = view.buffer.slice(metadataOffset, metadataLengthOffset)
Expand Down
Binary file added test/files/rowgroups.parquet
Binary file not shown.
99 changes: 99 additions & 0 deletions test/metadata.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ describe('parquetMetadata', () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
const result = parquetMetadata(arrayBuffer)

// Parquet v1 from DuckDB
const expectedMetadata = {
version: 1,
schema: [
Expand Down Expand Up @@ -61,6 +62,104 @@ describe('parquetMetadata', () => {
expect(casted).toEqual(expectedMetadata)
})

it('should correctly decode metadata from rowgroups.parquet', async () => {
  // Read the fixture file into an ArrayBuffer and parse its footer metadata
  const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
  const result = parquetMetadata(arrayBuffer)

  // Parquet v2 from pandas with 2 row groups
  // Enum values below are raw thrift integers (type, codec, encodings, page_type)
  // rather than decoded names — NOTE(review): confirm against parquet.thrift enums.
  const expectedMetadata = {
    version: 2,
    schema: [
      // Root schema group with one child column
      {
        repetition_type: 0,
        name: 'schema',
        num_children: 1,
      },
      // Leaf column 'numbers' (type 2 presumably INT64 — verify against thrift Type enum)
      {
        type: 2,
        repetition_type: 1,
        name: 'numbers',
      },
    ],
    num_rows: 15,
    row_groups: [
      // First row group: 10 rows
      {
        columns: [
          {
            file_offset: 150,
            file_path: undefined,
            meta_data: {
              codec: 1,
              data_page_offset: 71,
              dictionary_page_offset: 4,
              encoding_stats: [
                { count: 1, encoding: 0, page_type: 2 },
                { count: 1, encoding: 8, page_type: 0 },
              ],
              encodings: [0, 3, 8],
              num_values: 10,
              path_in_schema: ['numbers'],
              statistics: {
                // min/max are raw 8-byte little-endian strings ('\n' = 10, '\x01' = 1)
                max: '\n\x00\x00\x00\x00\x00\x00\x00',
                min: '\x01\x00\x00\x00\x00\x00\x00\x00',
                null_count: 0,
              },
              total_compressed_size: 146,
              total_uncompressed_size: 172,
              type: 2,
            },
          },
        ],
        total_byte_size: 172,
        num_rows: 10,
      },
      // Second row group: remaining 5 rows
      {
        columns: [
          {
            file_offset: 368,
            meta_data: {
              codec: 1,
              data_page_offset: 294,
              dictionary_page_offset: 248,
              encoding_stats: [
                { count: 1, encoding: 0, page_type: 2 },
                { count: 1, encoding: 8, page_type: 0 },
              ],
              encodings: [0, 3, 8],
              num_values: 5,
              path_in_schema: ['numbers'],
              statistics: {
                // min/max as raw little-endian bytes ('\x0F' = 15, '\x0B' = 11)
                max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
                min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
                null_count: 0,
              },
              total_compressed_size: 120,
              total_uncompressed_size: 126,
              type: 2,
            },
          },
        ],
        total_byte_size: 126,
        num_rows: 5,
      },
    ],
    // pandas/pyarrow attach serialized schema info; only the keys are asserted here
    key_value_metadata: [
      {
        key: 'pandas',
        // value: json
      },
      {
        key: 'ARROW:schema',
        // value: base64
      },
    ],
    created_by: 'parquet-cpp-arrow version 14.0.2',
  }

  // containSubset: only the listed keys must match; extra keys in result are allowed
  const casted = toJson(result)
  expect(casted).containSubset(expectedMetadata)
})

it('should throw an error for a too short file', () => {
const arrayBuffer = new ArrayBuffer(0)
expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
Expand Down

0 comments on commit 62632d9

Please sign in to comment.