Recently I came across the requirement to read a Parquet file into a Java application, and I found that it is neither well documented nor easy to do. As a consequence I wrote a short tutorial. The first task is to add the Maven dependencies.
<dependencies>
    <dependency>
        <groupId>org.apache.parquet</groupId>
        <artifactId>parquet-hadoop</artifactId>
        <version>1.9.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.0</version>
    </dependency>
</dependencies>
Writing the Java application is easy once you know how to do it. Instead of the AvroParquetReader or ParquetReader classes that you frequently find when searching for a solution to read Parquet files, use the ParquetFileReader class. The basic setup is to read all row groups and then read all groups recursively.
package de.jofre.test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

public class Main {

    private static Path path = new Path("file:\\C:\\myfile.snappy.parquet");

    // Prints all primitive fields of a group, one value per line.
    private static void printGroup(Group g) {
        int fieldCount = g.getType().getFieldCount();
        for (int field = 0; field < fieldCount; field++) {
            int valueCount = g.getFieldRepetitionCount(field);
            Type fieldType = g.getType().getType(field);
            String fieldName = fieldType.getName();
            for (int index = 0; index < valueCount; index++) {
                if (fieldType.isPrimitive()) {
                    System.out.println(fieldName + " " + g.getValueToString(field, index));
                }
            }
        }
        System.out.println("");
    }

    public static void main(String[] args) throws IllegalArgumentException {
        Configuration conf = new Configuration();
        try {
            // Read the footer first to obtain the schema, then open the file itself.
            ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
            MessageType schema = readFooter.getFileMetaData().getSchema();
            ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
            PageReadStore pages = null;
            try {
                // Iterate over all row groups in the file.
                while (null != (pages = r.readNextRowGroup())) {
                    final long rows = pages.getRowCount();
                    System.out.println("Number of rows: " + rows);
                    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
                    // Typing the reader as RecordReader<Group> means read() returns
                    // a Group directly, so no cast is needed.
                    final RecordReader<Group> recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
                    for (int i = 0; i < rows; i++) {
                        final Group g = recordReader.read();
                        printGroup(g);
                        // TODO Compare to System.out.println(g);
                    }
                }
            } finally {
                r.close();
            }
        } catch (IOException e) {
            System.out.println("Error reading parquet file.");
            e.printStackTrace();
        }
    }
}
Hello, thanks for the tutorial! 🙂 Just wanted to let you know that the < is displayed as &lt;, hence copying and pasting the code will not work immediately.
Ref: https://stackoverflow.com/questions/981852/why-is-showing-as-lt
Nice one – I’ve been tied up in knots trying to extract data from a parquet for ages. I got this working in a few minutes 🙂
Just a couple of small points.
1. As ‘MATTIA CASOTTO’ says, the ‘less than’ symbol is displayed as an HTML escape sequence.
2. recordReader.read() returns an object – you’ll need to cast it to a Group
It’s not a big deal, but the code won’t work as-is.
This was helpful, thanks very much. The output from recordReader.read() has an indented structure if there are arrays or objects. Instead of indented output, could it use a JSON formatter? Thanks
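As far as I know there is no ready-made JSON formatter for Group, but here is a rough, untested sketch of a hypothetical toJson helper that walks a Group by hand using only the accessors from the post. It recurses into nested groups, emits repeated fields as arrays, and does no escaping, so treat it as a starting point only:

import org.apache.parquet.example.data.Group;
import org.apache.parquet.schema.Type;

public class GroupJson {

    // Rough, untested sketch: serializes a Group to a JSON string by hand.
    // Repeated fields become arrays and nested groups recurse. Quotes and
    // special characters are not escaped, and every primitive is emitted
    // as a string.
    public static String toJson(Group g) {
        StringBuilder sb = new StringBuilder("{");
        int fieldCount = g.getType().getFieldCount();
        for (int field = 0; field < fieldCount; field++) {
            if (field > 0) {
                sb.append(",");
            }
            Type fieldType = g.getType().getType(field);
            sb.append("\"").append(fieldType.getName()).append("\":");
            boolean repeated = fieldType.isRepetition(Type.Repetition.REPEATED);
            if (repeated) {
                sb.append("[");
            }
            int valueCount = g.getFieldRepetitionCount(field);
            for (int index = 0; index < valueCount; index++) {
                if (index > 0) {
                    sb.append(",");
                }
                if (fieldType.isPrimitive()) {
                    sb.append("\"").append(g.getValueToString(field, index)).append("\"");
                } else {
                    sb.append(toJson(g.getGroup(field, index))); // recurse into nested group
                }
            }
            if (repeated) {
                sb.append("]");
            }
        }
        return sb.append("}").toString();
    }
}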
I was redirected here from your post at stackoverflow. Your code works for me. Thanks.
If I want to read multiple parquet files from a folder, do you have any ideas on how to do that?
Appreciate your help.
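There is nothing Parquet-specific needed for that; one option is to list the folder with Hadoop's FileSystem API and run the reading code from the post on each file. A quick untested sketch, where the folder path is made up and readParquetFile stands for the post's reading logic extracted into a method:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FolderReader {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path dir = new Path("file:///C:/myfolder"); // made-up folder path
        FileSystem fs = dir.getFileSystem(conf);
        for (FileStatus status : fs.listStatus(dir)) {
            if (status.isFile() && status.getPath().getName().endsWith(".parquet")) {
                System.out.println("Reading " + status.getPath());
                // readParquetFile(conf, status.getPath());
                // ...where readParquetFile wraps the ParquetFileReader loop from the post.
            }
        }
    }
}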
NICE, had to set javax.net.ssl.keyStore to a keystore I created using KMS cert since we are using CDH, but it worked! Now to convert this to CSV…
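If anyone wants to follow up on the CSV idea: assuming a flat schema with primitive-only columns and at most one value per field, a rough untested sketch (no quoting or escaping of commas) could build each line like this:

import org.apache.parquet.example.data.Group;

public class CsvPrinter {

    // Rough, untested sketch: turns a flat Group (primitive fields only,
    // at most one value per field) into a comma-separated line. Values
    // containing commas or quotes are not escaped.
    public static String toCsvLine(Group g) {
        StringBuilder sb = new StringBuilder();
        int fieldCount = g.getType().getFieldCount();
        for (int field = 0; field < fieldCount; field++) {
            if (field > 0) {
                sb.append(",");
            }
            if (g.getFieldRepetitionCount(field) > 0) {
                sb.append(g.getValueToString(field, 0));
            }
        }
        return sb.toString();
    }
}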
Thank you for the information. One thing I noticed is that you check whether a fieldType is primitive and discard it if not. Is there a reason behind this? How can we get values that are not primitive (possibly in JSON format)?
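Presumably the check is only there to keep the example simple, since printGroup as written can only print primitive leaves. Here is an untested variant that descends into nested groups instead of discarding them (the JSON sketch a few comments up recurses the same way):

import org.apache.parquet.example.data.Group;
import org.apache.parquet.schema.Type;

public class RecursivePrinter {

    // Untested variant of the post's printGroup: non-primitive fields are
    // followed recursively rather than skipped.
    public static void printGroup(Group g) {
        int fieldCount = g.getType().getFieldCount();
        for (int field = 0; field < fieldCount; field++) {
            Type fieldType = g.getType().getType(field);
            int valueCount = g.getFieldRepetitionCount(field);
            for (int index = 0; index < valueCount; index++) {
                if (fieldType.isPrimitive()) {
                    System.out.println(fieldType.getName() + " " + g.getValueToString(field, index));
                } else {
                    printGroup(g.getGroup(field, index)); // descend into the nested group
                }
            }
        }
    }
}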
Can you please provide code to write groups into parquet file
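I cannot promise this fits your exact setup, but a minimal untested sketch of the write path using the example API bundled with parquet-hadoop (ExampleParquetWriter plus SimpleGroupFactory; the schema, values, and output path are made up) might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriterExample {

    public static void main(String[] args) throws Exception {
        // Made-up schema and output path, purely for illustration.
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example { required int32 id; required binary name (UTF8); }");
        Configuration conf = new Configuration();

        ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path("file:///C:/out.parquet"))
                .withConf(conf)
                .withType(schema)
                .build();

        // Build groups against the schema and write them one by one.
        SimpleGroupFactory factory = new SimpleGroupFactory(schema);
        writer.write(factory.newGroup().append("id", 1).append("name", "first"));
        writer.write(factory.newGroup().append("id", 2).append("name", "second"));
        writer.close();
    }
}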
I am getting the below error when trying to use the above code:
java.lang.NoSuchMethodError: org.jets3t.service.impl.rest.httpclient.RestS3Service.<init>(Lorg/jets3t/service/security/AWSCredentials;)
CODE:
String PATH_SCHEMA = "s3://" + object.getBucketName() + "/" + object.getKey();
Path path = new Path(PATH_SCHEMA);
Configuration conf = new Configuration();
conf.set("fs.s3.awsAccessKeyId", credentials.accessKeyId);
conf.set("fs.s3.awsSecretAccessKey", credentials.secretKey);
try {
    ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
    PageReadStore pages = null;
    try {
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            System.out.println("Number of rows: " + rows);
            final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            final RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
        }
    } finally {
        r.close();
    }
} catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
}
I am using the exact versions of the dependencies mentioned above.
Please help here.
Hi, you are using an AWS S3 dependency and calling a method that does not exist. It has nothing to do with my code.
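That said, if the goal is reading from S3, one thing worth trying (an assumption on my part, untested) is the s3a connector from the hadoop-aws artifact instead of the legacy jets3t-backed s3:// scheme:

// Untested sketch: s3a avoids jets3t entirely. It requires the hadoop-aws
// artifact matching your hadoop-common version on the classpath.
// "object" and "credentials" are the same variables as in the comment above.
Configuration conf = new Configuration();
conf.set("fs.s3a.access.key", credentials.accessKeyId);
conf.set("fs.s3a.secret.key", credentials.secretKey);
Path path = new Path("s3a://" + object.getBucketName() + "/" + object.getKey());
// ...then read with ParquetFileReader exactly as in the post.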
Good one, easy to test parquet files using simple java code.