Skip to content

Commit

Permalink
Metadata test rowgroups.parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
platypii committed Jan 11, 2024
1 parent f2a15bd commit 62632d9
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ export function parquetMetadata(arrayBuffer) {
if (metadataLength <= 0 || metadataLength > metadataLengthOffset) {
throw new Error('parquet file invalid metadata length')
}
if (metadataLength > view.byteLength - 8) {
throw new Error('parquet file metadata length exceeds file size')
}

const metadataOffset = metadataLengthOffset - metadataLength
const metadataBuffer = view.buffer.slice(metadataOffset, metadataLengthOffset)
Expand Down
Binary file added test/files/rowgroups.parquet
Binary file not shown.
99 changes: 99 additions & 0 deletions test/metadata.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ describe('parquetMetadata', () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
const result = parquetMetadata(arrayBuffer)

// Parquet v1 from DuckDB
const expectedMetadata = {
version: 1,
schema: [
Expand Down Expand Up @@ -61,6 +62,104 @@ describe('parquetMetadata', () => {
expect(casted).toEqual(expectedMetadata)
})

it('should correctly decode metadata from rowgroups.parquet', async () => {
  // Read the fixture file into an ArrayBuffer and parse its footer metadata
  const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
  const result = parquetMetadata(arrayBuffer)

  // Parquet v2 from pandas with 2 row groups
  // Enum values below are raw thrift integers (type, codec, encodings, page_type)
  // rather than decoded names — NOTE(review): confirm against parquet.thrift enums.
  const expectedMetadata = {
    version: 2,
    schema: [
      // Root schema group with one child column
      {
        repetition_type: 0,
        name: 'schema',
        num_children: 1,
      },
      // Leaf column 'numbers' (type 2 presumably INT64 — verify against thrift Type enum)
      {
        type: 2,
        repetition_type: 1,
        name: 'numbers',
      },
    ],
    num_rows: 15,
    row_groups: [
      // First row group: 10 rows
      {
        columns: [
          {
            file_offset: 150,
            file_path: undefined,
            meta_data: {
              codec: 1,
              data_page_offset: 71,
              dictionary_page_offset: 4,
              encoding_stats: [
                { count: 1, encoding: 0, page_type: 2 },
                { count: 1, encoding: 8, page_type: 0 },
              ],
              encodings: [0, 3, 8],
              num_values: 10,
              path_in_schema: ['numbers'],
              statistics: {
                // min/max are raw 8-byte little-endian strings ('\n' = 10, '\x01' = 1)
                max: '\n\x00\x00\x00\x00\x00\x00\x00',
                min: '\x01\x00\x00\x00\x00\x00\x00\x00',
                null_count: 0,
              },
              total_compressed_size: 146,
              total_uncompressed_size: 172,
              type: 2,
            },
          },
        ],
        total_byte_size: 172,
        num_rows: 10,
      },
      // Second row group: remaining 5 rows
      {
        columns: [
          {
            file_offset: 368,
            meta_data: {
              codec: 1,
              data_page_offset: 294,
              dictionary_page_offset: 248,
              encoding_stats: [
                { count: 1, encoding: 0, page_type: 2 },
                { count: 1, encoding: 8, page_type: 0 },
              ],
              encodings: [0, 3, 8],
              num_values: 5,
              path_in_schema: ['numbers'],
              statistics: {
                // min/max as raw little-endian bytes ('\x0F' = 15, '\x0B' = 11)
                max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
                min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
                null_count: 0,
              },
              total_compressed_size: 120,
              total_uncompressed_size: 126,
              type: 2,
            },
          },
        ],
        total_byte_size: 126,
        num_rows: 5,
      },
    ],
    // pandas/pyarrow attach serialized schema info; only the keys are asserted here
    key_value_metadata: [
      {
        key: 'pandas',
        // value: json
      },
      {
        key: 'ARROW:schema',
        // value: base64
      },
    ],
    created_by: 'parquet-cpp-arrow version 14.0.2',
  }

  // containSubset: only the listed keys must match; extra keys in result are allowed
  const casted = toJson(result)
  expect(casted).containSubset(expectedMetadata)
})

it('should throw an error for a too short file', () => {
const arrayBuffer = new ArrayBuffer(0)
expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
Expand Down

0 comments on commit 62632d9

Please sign in to comment.