Skip to content

Commit

Permalink
Parquet header parser
Browse files Browse the repository at this point in the history
  • Loading branch information
platypii committed Jan 5, 2024
1 parent 04fa052 commit 92902c4
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ import { ParquetEncoding, ParquetType } from './constants.js'
import { readVarInt } from './thrift.js'

/**
* Return type with bytes read.
* This is useful to advance an offset through a buffer.
*
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
* @template T
*/
Expand Down
75 changes: 75 additions & 0 deletions src/header.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { deserializeTCompactProtocol } from './thrift.js'

/**
* Return type with bytes read.
* This is useful to advance an offset through a buffer.
*
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
* @template T
*/

/**
 * Read a parquet page header from a buffer.
 *
 * The header is decoded by deserializeTCompactProtocol, which yields an
 * object keyed by thrift field number (`field_1`, `field_2`, ...). This
 * function copies those numbered fields onto the named PageHeader properties.
 * Optional sub-structs that are absent from the decoded header are returned
 * as-is (typically undefined).
 *
 * @typedef {import("./types.d.ts").PageHeader} PageHeader
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @param {number} offset offset to start reading from
 * @returns {Decoded<PageHeader>} page header and bytes read
 */
export function parquetHeader(arrayBuffer, offset) {
  // deserializeTCompactProtocol is called without an offset, so hand it a
  // copy of the buffer that begins at the header.
  const headerBuffer = arrayBuffer.slice(offset)
  const { value: header, byteLength } = deserializeTCompactProtocol(headerBuffer)

  // Parse parquet header from thrift data
  const {
    field_5: dataPage,
    field_7: dictionaryPage,
    field_8: dataPageV2,
  } = header

  const data_page_header = dataPage && {
    num_values: dataPage.field_1,
    encoding: dataPage.field_2,
    definition_level_encoding: dataPage.field_3,
    repetition_level_encoding: dataPage.field_4,
    statistics: pageStatistics(dataPage.field_5),
  }
  const dictionary_page_header = dictionaryPage && {
    num_values: dictionaryPage.field_1,
    encoding: dictionaryPage.field_2,
    is_sorted: dictionaryPage.field_3,
  }
  const data_page_header_v2 = dataPageV2 && {
    num_values: dataPageV2.field_1,
    num_nulls: dataPageV2.field_2,
    num_rows: dataPageV2.field_3,
    encoding: dataPageV2.field_4,
    definition_levels_byte_length: dataPageV2.field_5,
    repetition_levels_byte_length: dataPageV2.field_6,
    is_compressed: dataPageV2.field_7,
    // Named the same way as data_page_header.statistics so both page
    // versions expose one statistics shape.
    statistics: pageStatistics(dataPageV2.field_8),
  }

  return {
    byteLength,
    value: {
      type: header.field_1,
      uncompressed_page_size: header.field_2,
      compressed_page_size: header.field_3,
      crc: header.field_4,
      data_page_header,
      // The index page header is passed through without renaming.
      index_page_header: header.field_6,
      dictionary_page_header,
      data_page_header_v2,
    },
  }
}

/**
 * Rename the numbered fields of a thrift-decoded statistics struct.
 *
 * @param {any} stats decoded statistics struct, or a falsy value when absent
 * @returns {any} statistics with named properties, or `stats` itself when falsy
 */
function pageStatistics(stats) {
  return stats && {
    max: stats.field_1,
    min: stats.field_2,
    null_count: stats.field_3,
    distinct_count: stats.field_4,
    max_value: stats.field_5,
    min_value: stats.field_6,
  }
}
39 changes: 39 additions & 0 deletions src/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,42 @@ interface SortingColumn {
descending: boolean
nulls_first: boolean
}

// Parquet page header types
/**
 * Page header as returned by parquetHeader (src/header.js).
 * Each property is read from the numbered field of the thrift-decoded
 * header noted beside it.
 */
export interface PageHeader {
// thrift field 1
type: PageType
// thrift field 2
uncompressed_page_size: number
// thrift field 3
compressed_page_size: number
// thrift field 4
crc?: number
// thrift field 5
data_page_header?: DataPageHeader
// thrift field 6
index_page_header?: IndexPageHeader
// thrift field 7
dictionary_page_header?: DictionaryPageHeader
// thrift field 8
data_page_header_v2?: DataPageHeaderV2
}

/**
 * Data page header, carried in PageHeader.data_page_header.
 * parquetHeader (src/header.js) fills these properties from the numbered
 * fields of the decoded sub-struct, in the order listed (fields 1 to 5).
 */
export interface DataPageHeader {
num_values: number
encoding: Encoding
definition_level_encoding: Encoding
repetition_level_encoding: Encoding
statistics?: Statistics
}

/**
 * Index page header, carried in PageHeader.index_page_header.
 * Declares no properties; parquetHeader (src/header.js) passes the decoded
 * value through without renaming any fields.
 */
interface IndexPageHeader {}

/**
 * Dictionary page header, carried in PageHeader.dictionary_page_header.
 * parquetHeader (src/header.js) fills these properties from the numbered
 * fields of the decoded sub-struct, in the order listed (fields 1 to 3).
 */
export interface DictionaryPageHeader {
num_values: number
encoding: Encoding
is_sorted?: boolean
}

/**
 * Version 2 data page header, carried in PageHeader.data_page_header_v2.
 * parquetHeader (src/header.js) reads these properties from the numbered
 * fields of the decoded sub-struct, in the order listed (fields 1 to 8).
 */
interface DataPageHeaderV2 {
num_values: number
num_nulls: number
num_rows: number
encoding: Encoding
definition_levels_byte_length: number
repetition_levels_byte_length: number
is_compressed?: boolean
statistics?: Statistics
}
2 changes: 2 additions & 0 deletions test/encoding.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,5 @@ describe('readPlain', () => {
expect(() => readPlain(dataView, invalidType, 1, 0)).toThrow(`Unhandled type: ${invalidType}`)
})
})

// TODO: Add tests for readData

0 comments on commit 92902c4

Please sign in to comment.