Reading Parquet Files from a Java Application

Recently I came across the requirement to read a Parquet file into a Java application, and it turned out that this is neither well documented nor easy to do. As a consequence I wrote this short tutorial. The first task is to add the Maven dependencies.

<dependencies>
  <dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.9.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.0</version>
  </dependency>
</dependencies>
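In case you do not have a Parquet file at hand to test with, you can generate one with the example writer that ships with parquet-hadoop. This is only a minimal sketch: the schema, the values and the output path are made up for illustration, and it assumes that ExampleParquetWriter is available in your parquet-hadoop version.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriteSample {
  public static void main(String[] args) throws Exception {
    // Hypothetical two-column schema, adjust it to your data.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message sample { required int32 id; required binary name; }");
    Path path = new Path("file:///C:/sample.parquet");

    SimpleGroupFactory factory = new SimpleGroupFactory(schema);
    try (ParquetWriter<Group> writer =
        ExampleParquetWriter.builder(path).withType(schema).build()) {
      writer.write(factory.newGroup().append("id", 1).append("name", "alice"));
      writer.write(factory.newGroup().append("id", 2).append("name", "bob"));
    }
  }
}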

Writing the Java application is easy once you know how. Instead of the AvroParquetReader or the ParquetReader class that you frequently find when searching for a way to read Parquet files, use the ParquetFileReader class (a short ParquetReader sketch follows after the full example). The basic setup is to read all row groups and then walk through each group recursively.
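Before processing any rows it helps to verify that the footer can be read, since it already contains the schema. A minimal sketch with the same classes as in the full example:

Configuration conf = new Configuration();
Path path = new Path("file:///C:/myfile.snappy.parquet");

// Reads only the file metadata, no row data is touched yet.
ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
System.out.println(footer.getFileMetaData().getSchema());

The complete application then looks like this: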

package de.jofre.test;

import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
 
public class Main {
 
  // Note: a file URI uses forward slashes, even on Windows.
  private static Path path = new Path("file:///C:/myfile.snappy.parquet");
 
  private static void printGroup(Group g) {
    int fieldCount = g.getType().getFieldCount();
    for (int field = 0; field < fieldCount; field++) {
      int valueCount = g.getFieldRepetitionCount(field);
 
      Type fieldType = g.getType().getType(field);
      String fieldName = fieldType.getName();
 
      for (int index = 0; index < valueCount; index++) {
        if (fieldType.isPrimitive()) {
          System.out.println(fieldName + " " + g.getValueToString(field, index));
        } else {
          // Nested field: descend into the subgroup recursively.
          printGroup(g.getGroup(field, index));
        }
      }
    }
    System.out.println("");
  }
 
  public static void main(String[] args) throws IllegalArgumentException {
 
    Configuration conf = new Configuration();
 
    try {
      // The footer contains the file metadata, including the schema and the row groups.
      ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
      MessageType schema = readFooter.getFileMetaData().getSchema();
      ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
 
      PageReadStore pages = null;
      try {
        while (null != (pages = r.readNextRowGroup())) {
          final long rows = pages.getRowCount();
          System.out.println("Number of rows: " + rows);
 
          final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
          final RecordReader<Group> recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
          for (int i = 0; i < rows; i++) {
            final Group g = recordReader.read();
            printGroup(g);
 
            // TODO Compare to System.out.println(g);
          }
        }
      } finally {
        r.close();
      }
    } catch (IOException e) {
      System.out.println("Error reading parquet file.");
      e.printStackTrace();
    }
  }
}
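For comparison, the ParquetReader mentioned at the beginning can read the same file when it is combined with the GroupReadSupport that ships with parquet-hadoop. A minimal sketch, assuming the same dependencies as above:

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReaderMain {
  public static void main(String[] args) throws Exception {
    Path path = new Path("file:///C:/myfile.snappy.parquet");

    // ParquetReader hides the row group and record reader handling shown above.
    try (ParquetReader<Group> reader =
        ParquetReader.builder(new GroupReadSupport(), path).build()) {
      Group g;
      while ((g = reader.read()) != null) {
        System.out.println(g); // Group.toString() prints field names and values
      }
    }
  }
}

The trade-off is less control: this variant does not expose the footer or the row group boundaries, which is exactly what the ParquetFileReader approach above gives you.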
