
Commit 8f198c5

Support parquet file format. (#100)
1 parent 0ceb7db commit 8f198c5

4 files changed

Lines changed: 107 additions & 1 deletion


handwritten/bigquery/samples/system-test/tables.test.js

Lines changed: 20 additions & 0 deletions
@@ -151,6 +151,26 @@ test.serial(`should extract a table to GCS`, async t => {
     .start();
 });
 
+test(`should load a GCS Parquet file with explicit schema`, async t => {
+  t.plan(1);
+  const tableId = generateUuid();
+
+  const output = await tools.runAsync(
+    `${cmd} load-gcs-parquet ${projectId} ${datasetId} ${tableId}`,
+    cwd
+  );
+  t.regex(output, /completed\./);
+  await tools
+    .tryTest(async assert => {
+      const [rows] = await bigquery
+        .dataset(datasetId)
+        .table(tableId)
+        .getRows();
+      assert(rows.length > 0);
+    })
+    .start();
+});
+
 test(`should load a GCS CSV file with explicit schema`, async t => {
   t.plan(1);
   const tableId = generateUuid();
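
For context, the assertion at the end of the new test boils down to a plain getRows() call against the freshly loaded table. A minimal sketch of that check outside the test harness, with placeholder project, dataset, and table IDs:

const BigQuery = require('@google-cloud/bigquery');
const bigquery = new BigQuery({projectId: 'your-project-id'});

bigquery
  .dataset('my_dataset')
  .table('my_table')
  .getRows()
  .then(([rows]) => {
    // The system test asserts that at least one row was loaded.
    console.log(`Loaded ${rows.length} rows.`);
  })
  .catch(err => console.error('ERROR:', err));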

handwritten/bigquery/samples/tables.js

Lines changed: 75 additions & 0 deletions
@@ -240,6 +240,73 @@ function loadLocalFile(datasetId, tableId, filename, projectId) {
   // [END bigquery_load_from_file]
 }
 
+function loadParquetFromGCS(datasetId, tableId, projectId) {
+  // [START bigquery_load_table_gcs_parquet]
+  // Imports the Google Cloud client libraries
+  const BigQuery = require('@google-cloud/bigquery');
+  const Storage = require('@google-cloud/storage');
+
+  /**
+   * TODO(developer): Uncomment the following lines before running the sample.
+   */
+  // const projectId = "your-project-id";
+  // const datasetId = "my_dataset";
+  // const tableId = "my_table";
+
+  /**
+   * This sample loads the Parquet file at
+   * https://storage.googleapis.com/cloud-samples-data/bigquery/us-states/us-states.parquet
+   *
+   * TODO(developer): Replace the following lines with the path to your file.
+   */
+  const bucketName = 'cloud-samples-data';
+  const filename = 'bigquery/us-states/us-states.parquet';
+
+  // Instantiates clients
+  const bigquery = new BigQuery({
+    projectId: projectId,
+  });
+
+  const storage = new Storage({
+    projectId: projectId,
+  });
+
+  // Configure the load job. For full list of options, see:
+  // https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load
+  const metadata = {
+    sourceFormat: 'PARQUET',
+    schema: {
+      fields: [
+        {name: 'name', type: 'STRING'},
+        {name: 'post_abbr', type: 'STRING'},
+      ],
+    },
+  };
+
+  // Loads data from a Google Cloud Storage file into the table
+  bigquery
+    .dataset(datasetId)
+    .table(tableId)
+    .load(storage.bucket(bucketName).file(filename), metadata)
+    .then(results => {
+      const job = results[0];
+
+      // load() waits for the job to finish
+      assert.equal(job.status.state, 'DONE');
+      console.log(`Job ${job.id} completed.`);
+
+      // Check the job's status for errors
+      const errors = job.status.errors;
+      if (errors && errors.length > 0) {
+        throw errors;
+      }
+    })
+    .catch(err => {
+      console.error('ERROR:', err);
+    });
+  // [END bigquery_load_table_gcs_parquet]
+}
+
 function loadCSVFromGCS(datasetId, tableId, projectId) {
   // [START bigquery_load_table_gcs_csv]
   // Imports the Google Cloud client libraries
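
The sample above follows the promise-then style used throughout tables.js. As a minimal sketch only (not part of this commit), the same load can be written with async/await; note that the sample's assert.equal call presumes an assert module is in scope elsewhere in tables.js, whereas this version checks the job status directly:

const BigQuery = require('@google-cloud/bigquery');
const Storage = require('@google-cloud/storage');

async function loadParquet(projectId, datasetId, tableId) {
  const bigquery = new BigQuery({projectId});
  const storage = new Storage({projectId});
  const file = storage
    .bucket('cloud-samples-data')
    .file('bigquery/us-states/us-states.parquet');

  const metadata = {
    sourceFormat: 'PARQUET',
    schema: {
      fields: [
        {name: 'name', type: 'STRING'},
        {name: 'post_abbr', type: 'STRING'},
      ],
    },
  };

  // load() resolves once the job has finished.
  const [job] = await bigquery
    .dataset(datasetId)
    .table(tableId)
    .load(file, metadata);

  // Surface any job-level errors rather than asserting on job state.
  if (job.status.errors && job.status.errors.length > 0) {
    throw job.status.errors;
  }
  console.log(`Job ${job.id} completed.`);
}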
@@ -667,6 +734,14 @@ require(`yargs`)
      );
    }
  )
+  .command(
+    `load-gcs-parquet <projectId> <datasetId> <tableId>`,
+    `Loads sample Parquet data from a Google Cloud Storage file into a table.`,
+    {},
+    opts => {
+      loadParquetFromGCS(opts.datasetId, opts.tableId, opts.projectId);
+    }
+  )
  .command(
    `load-gcs-csv <projectId> <datasetId> <tableId>`,
    `Loads sample CSV data from a Google Cloud Storage file into a table.`,
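
With the subcommand registered, the system test's invocation corresponds to running tables.js directly. A rough sketch of that invocation from Node, assuming tables.js is the CLI entry point in the current directory and using placeholder IDs:

const {execSync} = require('child_process');

// Mirrors the command string the system test passes to tools.runAsync.
const output = execSync(
  'node tables.js load-gcs-parquet your-project-id my_dataset my_table'
).toString();

// On success the sample logs `Job <id> completed.`, which is what the
// system test's t.regex(output, /completed\./) assertion matches.
console.log(output);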

handwritten/bigquery/src/table.js

Lines changed: 2 additions & 1 deletion
@@ -38,6 +38,7 @@ var FORMATS = {
   avro: 'AVRO',
   csv: 'CSV',
   json: 'NEWLINE_DELIMITED_JSON',
+  parquet: 'PARQUET',
 };
 
 /**
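
The FORMATS map translates the lower-case format names accepted in the library's options into the upper-case enum values the BigQuery API expects. A sketch of how that lookup presumably behaves (not the library's exact code; resolveFormat is a hypothetical helper):

var FORMATS = {
  avro: 'AVRO',
  csv: 'CSV',
  json: 'NEWLINE_DELIMITED_JSON',
  parquet: 'PARQUET',
};

// e.g. resolveFormat({format: 'parquet'}) === 'PARQUET', matching the
// destinationFormat the unit test below asserts on. The 'csv' fallback
// reflects the documented Default: "CSV".
function resolveFormat(options) {
  var format = (options.format || 'csv').toLowerCase();
  return FORMATS[format];
}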
@@ -763,7 +764,7 @@ Table.prototype.createCopyFromJob = function(sourceTables, metadata, callback) {
  * to. A string or a {@link https://cloud.google.com/nodejs/docs/reference/storage/latest/File File} object.
  * @param {object=} options - The configuration object.
  * @param {string} options.format - The format to export the data in. Allowed
- *     options are "CSV", "JSON", or "AVRO". Default: "CSV".
+ *     options are "CSV", "JSON", "AVRO", or "PARQUET". Default: "CSV".
  * @param {boolean} options.gzip - Specify if you would like the file compressed
  *     with GZIP. Default: false.
  * @param {string} [options.jobId] Custom job id.

handwritten/bigquery/test/table.js

Lines changed: 10 additions & 0 deletions
@@ -855,6 +855,16 @@ describe('BigQuery/Table', function() {
 
       table.createExtractJob(FILE, {format: 'avro'}, assert.ifError);
     });
+
+    it('should accept parquet', function(done) {
+      table.bigQuery.createJob = function(reqOpts) {
+        var extract = reqOpts.configuration.extract;
+        assert.equal(extract.destinationFormat, 'PARQUET');
+        done();
+      };
+
+      table.createExtractJob(FILE, {format: 'parquet'}, assert.ifError);
+    });
   });
 
   it('should parse out full gs:// urls from files', function(done) {
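
From a caller's perspective, the change means createExtractJob now accepts {format: 'parquet'} the same way it accepts the other formats. A brief sketch with hypothetical bucket, file, and variable names:

// `table` is a Table instance and `storage` a Storage client; the
// destination bucket and filename here are placeholders.
table.createExtractJob(
  storage.bucket('my-bucket').file('us-states.parquet'),
  {format: 'parquet'},
  (err, job) => {
    if (err) throw err;
    console.log(`Started extract job ${job.id}`);
  }
);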
