Update README

hyparam · Jan 9, 2024 · 58aed8d · 58aed8d
1 parent 271cc72
commit 58aed8d
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -12,18 +12,48 @@ Apache Parquet is an open source, column-oriented data file format designed for
 
 Dependency free since 2023!
 
-## Usage
+## Features
+
+- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata))
+- Loads metadata separately from data
+- Data can be filtered by row and column ranges
+- Only fetches the data needed
+- Fast data loading for large scale ML applications
+- Brings data visualization closer to the user, in the browser
+
+## Installation
 
 ```bash
 npm install hyparquet
 ```
 
+## Usage
+
+If you're in a Node.js environment, you can load the metadata of a parquet file as in the following example:
+
+```js
+const { parquetMetadata } = await import('hyparquet')
+const fs = await import('fs')
+
+const buffer = fs.readFileSync('example.parquet')
+const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
+const metadata = parquetMetadata(arrayBuffer)
+```
+
+If you're in a browser environment, you'll probably get parquet file data either from a file that the user drags and drops onto the page, or by downloading it from the web.
+
+To load parquet data in the browser from a remote server using `fetch`:
+
 ```js
 import { parquetMetadata } from 'hyparquet'
 
-const metadata = parquetMetdata(arrayBuffer)
+const res = await fetch(url)
+const arrayBuffer = await res.arrayBuffer()
+const metadata = parquetMetadata(arrayBuffer)
 ```
 
+To parse parquet files from a user drag-and-drop action, see the example in [index.html](index.html).
+
 ## References
 
  - https://github.com/apache/parquet-format

diff --git a/src/encoding.js b/src/encoding.js
@@ -308,7 +308,8 @@ function readRle(dataView, offset, header, bitWidth) {
  * @returns {Decoded<number[]>} array of bit-packed values
  */
 function readBitPacked(dataView, offset, header, bitWidth, remaining) {
-  let count = (header >> 1) * 8
+  // extract number of values to read from header
+  let count = (header >> 1) << 3
   const mask = maskForBits(bitWidth)
 
   let data = dataView.getUint8(offset)
@@ -318,20 +319,24 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) {
   /** @type {number[]} */
   const value = []
 
+  // read values
   while (count) {
+    // if we have crossed a byte boundary, shift the data
     if (right > 8) {
       right -= 8
       left -= 8
       data >>= 8
     } else if (left - right < bitWidth) {
-      // read next byte
-      data |= (dataView.getUint8(offset + byteLength) << left)
+      // if we don't have bitWidth number of bits to read, read next byte
+      data |= dataView.getUint8(offset + byteLength) << left
       byteLength++
       left += 8
     } else {
-      // don't write more than num rows
+      // otherwise, read bitWidth number of bits
+      // don't write more than remaining number of rows
+      // even if there are still bits to read
       if (remaining > 0) {
-        // emit value
+        // emit value by shifting off to the right and masking
         value.push((data >> right) & mask)
         remaining--
       }
@@ -340,6 +345,7 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) {
     }
   }
 
+  // return values and number of bytes read
   return { value, byteLength }
 }
 

diff --git a/test/encoding.test.js b/test/encoding.test.js
@@ -26,7 +26,7 @@ describe('readPlain', () => {
   })
 
   it('reads INT96 values correctly', () => {
-    const buffer = new ArrayBuffer(12) // 12 bytes for a single INT96 value
+    const buffer = new ArrayBuffer(12)
     const dataView = new DataView(buffer)
 
     // Example INT96 value split into 64-bit low part and 32-bit high part