Skip to content

Commit

Permalink
Parquet schema utils
Browse files Browse the repository at this point in the history
  • Loading branch information
platypii committed Jan 7, 2024
1 parent 37e981f commit efdaf2c
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/hyparquet.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export function parquetRead(arrayBuffer: ArrayBuffer): any[][]
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {FileMetaData} metadata object
*/
export function parquetMetadata(arrayBuffer: ArrayBuffer): any
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData

/**
* Decompress snappy data.
Expand Down
113 changes: 113 additions & 0 deletions src/schema.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import { FieldRepetitionType } from './types.js'

/**
* @typedef {import('./types.js').SchemaElement} SchemaElement
* @typedef {{ element: SchemaElement, children: SchemaTree[], endIndex: number }} SchemaTree
*/

/**
* Build a tree from the schema elements.
*
* @param {SchemaElement[]} schema
* @param {number} i index of the root element
* @returns {SchemaTree} tree of schema elements
*/
function schemaTree(schema, i) {
const root = schema[i]
const children = []
i++

// Read the specified number of children
if (root.num_children) {
while (children.length < root.num_children) {
const child = schemaTree(schema, i)
i = child.endIndex
children.push(child)
}
}

return { endIndex: i, element: root, children }
}

/**
 * Look up a schema element by its path from the root.
 *
 * The schema tree is built from the first element of `schema`, then walked
 * one path segment at a time, matching each segment against the names of
 * the current node's direct children. An empty path yields the root element.
 *
 * @param {SchemaElement[]} schema flattened list of schema elements
 * @param {string[]} name path segments leading to the element
 * @returns {SchemaElement} the element found at the end of the path
 * @throws {Error} if some segment has no child with a matching name
 */
export function schemaElement(schema, name) {
  let current = schemaTree(schema, 0)

  for (const segment of name) {
    // Scan the direct children for the first one carrying this name.
    let next
    for (const candidate of current.children) {
      if (candidate.element.name === segment) {
        next = candidate
        break
      }
    }
    if (!next) {
      throw new Error(`schema element not found: ${name}`)
    }
    current = next
  }

  return current.element
}

/**
 * Tell whether the element at the given schema path is marked REQUIRED.
 *
 * @param {SchemaElement[]} schema flattened list of schema elements
 * @param {string[]} name path segments leading to the element
 * @returns {boolean} true when the element's repetition type is REQUIRED
 */
export function isRequired(schema, name) {
  const { repetition_type } = schemaElement(schema, name)
  return repetition_type === FieldRepetitionType.REQUIRED
}

/**
 * Compute the max repetition level for a schema path.
 *
 * Every prefix of the path is resolved to its schema element, and the level
 * is the number of those elements whose repetition type is REPEATED.
 *
 * @param {SchemaElement[]} schema flattened list of schema elements
 * @param {string[]} parts path segments leading to the element
 * @returns {number} count of REPEATED elements along the path
 */
export function getMaxRepetitionLevel(schema, parts) {
  return parts.reduce((level, _segment, depth) => {
    // Resolve the ancestor (or the element itself) at this depth of the path.
    const prefix = parts.slice(0, depth + 1)
    const { repetition_type } = schemaElement(schema, prefix)
    return repetition_type === FieldRepetitionType.REPEATED ? level + 1 : level
  }, 0)
}

/**
 * Compute the max definition level for a schema path.
 *
 * Every prefix of the path is resolved to its schema element, and the level
 * is the number of those elements whose repetition type is anything other
 * than REQUIRED.
 *
 * @param {SchemaElement[]} schema flattened list of schema elements
 * @param {string[]} parts path segments leading to the element
 * @returns {number} count of non-REQUIRED elements along the path
 */
export function getMaxDefinitionLevel(schema, parts) {
  return parts.reduce((level, _segment, depth) => {
    // Resolve the ancestor (or the element itself) at this depth of the path.
    const prefix = parts.slice(0, depth + 1)
    const { repetition_type } = schemaElement(schema, prefix)
    return repetition_type !== FieldRepetitionType.REQUIRED ? level + 1 : level
  }, 0)
}

/**
 * Compute how many bytes to skip over for definition levels.
 *
 * The result starts at a fixed 6 bytes and grows by one byte for every
 * 7-bit group needed to hold `num` once its low 8 bits are shifted away.
 *
 * @param {number} num number of values
 * @returns {number} number of bytes to skip
 */
export function skipDefinitionBytes(num) {
  const baseBytes = 6
  let extraBytes = 0
  // Unsigned shifts throughout, so the loop always terminates at zero.
  for (let rest = num >>> 8; rest !== 0; rest >>>= 7) {
    extraBytes++
  }
  return baseBytes + extraBytes
}

0 comments on commit efdaf2c

Please sign in to comment.