Java tutorial: uploading Mysqldump2csv output to Google BigQuery
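This tutorial walks through CsvUploader, a small command-line tool that loads the CSV output of Mysqldump2csv into Google BigQuery using the Google API client library for Java. As a quick orientation before the source, a typical invocation might look like the following; the project ID, dataset, service-account address, key path, and dump directory are placeholder values to replace with your own:

    java tk.feelai.bigquery.CsvUploader my-project my_dataset \
        uploader@my-project.iam.gserviceaccount.com /path/to/key.p12 /path/to/dump_dir

Run like this it only uploads an existing dump directory; appending Mysqldump2csv options (or -stdin) first regenerates the CSV files, as the usage() method below describes.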
package tk.feelai.bigquery;

/*
 * Copyright (c) 2017 Hiroshi Matsuda.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.googleapis.media.MediaHttpUploader;
import com.google.api.client.googleapis.media.MediaHttpUploaderProgressListener;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.http.InputStreamContent;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
import com.google.api.services.bigquery.BigqueryScopes;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetList;
import com.google.api.services.bigquery.model.DatasetList.Datasets;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobList;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableList;
import com.google.api.services.bigquery.model.TableList.Tables;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedList;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipInputStream;

public class CsvUploader {

    private static void usage() {
        System.err.println("Usage:");
        System.err.println(
                "java tk.feelai.bigquery.CsvUploader project_id dataset_id service_account_email client_secret_p12_file_path dump_dir [(Mysqldump2csv_options | -stdin)]");
        System.err.println("  dump_dir must contain results of Mysqldump2csv.");
        System.err.println(
                "  dump_dir will be overwritten by Mysqldump2csv results if Mysqldump2csv_options or -stdin is specified.");
    }

    public static void main(String[] args) throws Exception {
        if (args.length < 5) {
            usage();
            return;
        }
        String target = args[4];
        if (args.length >= 6) {
            if ("-stdin".equalsIgnoreCase(args[5])) {
                Mysqldump2csv.main(new String[] { target });
            } else {
                String[] dumpArgs = new String[args.length - 4];
                System.arraycopy(args, 4, dumpArgs, 0, dumpArgs.length);
                Mysqldump2csv.main(dumpArgs);
            }
        }
        CsvUploader uploader = new CsvUploader();
        System.err.print("authorizing ...");
        uploader.authorize(args[0], args[1], args[2], new File(args[3]));
        System.err.println(" done");
        uploader.prepareDataset();
        uploader.uploadAll(new File(target), true, false, 0);
    }

    /**
     * Runs the callable, retrying up to retryMax times with a 10-second wait between attempts.
     */
    private static <T> T autoRetry(int retryMax, Callable<T> callable) throws Exception {
        Exception lastException = null;
        for (int a = 0; a <= retryMax; a++) {
            try {
                return callable.call();
            } catch (Exception e) {
                lastException = e;
                System.err.println(e);
            }
            if (a < retryMax) {
                try {
                    System.err.println("auto-retry in 10 seconds");
                    Thread.sleep(10000);
                } catch (Exception e) {
                    // ignore interruption and retry immediately
                }
            }
        }
        throw new IllegalStateException(lastException);
    }

    /**
     * schemaFile format is "field_name\tfield_type\n"
     */
    private static TableSchema loadSchema(File schemaFile) throws IOException {
        TableSchema schema = new TableSchema();
        ArrayList<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
        BufferedReader in = new BufferedReader(new FileReader(schemaFile));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                String[] f = line.split("\t");
                TableFieldSchema s = new TableFieldSchema();
                s.setName(f[0]);
                s.setType(f[1]);
                fields.add(s);
            }
            schema.setFields(fields);
        } finally {
            in.close();
        }
        return schema;
    }

    private static final JsonFactory JSON_FACTORY = JacksonFactory.getDefaultInstance();
    private static final Pattern FILE_NAME_PATTERN = Pattern.compile("^([^.]+)\\..+$");

    private HttpTransport httpTransport;
    private String projectId;
    private String datasetId;
    private Credential credential;
    private Bigquery bigquery;
    private Dataset dataset;

    public CsvUploader() throws Exception {
        httpTransport = GoogleNetHttpTransport.newTrustedTransport();
    }

    /**
     * Builds a service-account credential that authorizes the application to access the
     * project's BigQuery data.
     */
    public void authorize(String projectId, String datasetId, String accountId, File clientSecret) throws Exception {
        credential = new GoogleCredential.Builder().setTransport(httpTransport).setJsonFactory(JSON_FACTORY)
                .setServiceAccountId(accountId)
                .setServiceAccountScopes(Collections.singleton(BigqueryScopes.BIGQUERY))
                .setServiceAccountPrivateKeyFromP12File(clientSecret).build();
        this.projectId = projectId;
        this.datasetId = datasetId;
    }

    /**
     * Connects to BigQuery and attaches to the dataset, creating it first when it does not
     * exist yet.
     */
    public void prepareDataset() throws Exception {
        System.err.print("connecting to " + projectId + " ...");
        bigquery = new Bigquery.Builder(httpTransport, JSON_FACTORY, credential)
                .setApplicationName("tk.feelai.bigquery.CsvUploader").build();
        System.err.println(" done");
        System.err.print("getting dataset list ...");
        DatasetList datasetList = bigquery.datasets().list(projectId).execute();
        System.err.println(" done");
        try {
            for (Datasets d : datasetList.getDatasets()) {
                if (String.format("%s:%s", projectId, datasetId).equals(d.getId())) {
                    dataset = autoRetry(6, new Callable<Dataset>() {
                        @Override
                        public Dataset call() throws Exception {
                            return bigquery.datasets().get(projectId, datasetId).execute();
                        }
                    });
                    break;
                }
            }
        } catch (NullPointerException e) {
            // getDatasets() returns null when the project has no datasets yet
        }
        if (dataset == null) {
            System.err.print("creating " + datasetId + " ...");
            DatasetReference datasetRef = new DatasetReference().setProjectId(projectId).setDatasetId(datasetId);
            final Dataset outputDataset = new Dataset().setDatasetReference(datasetRef);
            dataset = autoRetry(6, new Callable<Dataset>() {
                @Override
                public Dataset call() throws Exception {
                    return bigquery.datasets().insert(projectId, outputDataset).execute();
                }
            });
            System.err.println(" done");
        } else {
            System.err.println("attached to " + datasetId);
        }
        JobList jobList = bigquery.jobs().list(projectId).execute();
        System.err.println("recent job list is below:");
        System.err.println(jobList.getJobs());
    }

    /**
     * Uploads every table whose *.schema file is found in the directory; tables that fail
     * are re-queued and retried.
     *
     * @param dir            directory containing the *.schema and *.csv files
     * @param resumable      true for resumable upload (suited to files over about 1MB);
     *                       false for direct upload
     * @param useGZipContent true to gzip the content during transfer; false to send it as-is
     * @param maxBadRecords  number of bad records to tolerate; normally 0
     */
    public void uploadAll(final File dir, boolean resumable, boolean useGZipContent, int maxBadRecords)
            throws Exception {
        System.err.println("upload target directory is " + dir + ". options: resumable=" + resumable
                + ", useGZipContent=" + useGZipContent + ", maxBadRecords=" + maxBadRecords);
        Queue<File> queue = new LinkedList<File>();
        File[] files = dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".schema");
            }
        });
        Arrays.sort(files);
        for (final File file : files) {
            queue.add(file);
        }
        while (queue.size() > 0) {
            final File schema = queue.poll();
            final String fileName = schema.getName();
            Matcher matcher = FILE_NAME_PATTERN.matcher(fileName);
            if (!matcher.matches()) {
                throw new IllegalStateException();
            }
            final String tableName = matcher.group(1);
            try {
                uploadTable(tableName, schema, resumable, useGZipContent, maxBadRecords);
            } catch (Exception e) {
                queue.add(schema);
                System.err.println("Exception occurred; schema appended to retry queue:");
                e.printStackTrace(System.err);
            }
        }
        System.err.println("upload completed in " + dir);
    }

    /**
     * Uploads the CSV files that belong to the schema. When the first CSV (sequence number 0)
     * exists, any existing table is deleted and recreated before the records are inserted into
     * BigQuery; otherwise the records are appended to the existing table.
     *
     * @param tableName      table name; files matching "${tableName}.([0-9]+.)?.csv" are uploaded
     * @param schema         schema file describing the CSV columns
     * @param resumable      true for resumable upload; false for direct upload
     * @param useGZipContent true to gzip the content during transfer; false to send it as-is
     * @param maxBadRecords  number of bad records to tolerate; normally 0
     */
    public void uploadTable(final String tableName, final File schema, boolean resumable, boolean useGZipContent,
            int maxBadRecords) throws Exception {
        System.err.print("getting table list ...");
        TableList tableList = bigquery.tables().list(projectId, datasetId).execute();
        System.err.println(" done");
        File dir = schema.getParentFile();
        boolean renew = Mysqldump2csv.firstCsvExists(dir, tableName), exists = false;
        try {
            for (Tables t : tableList.getTables()) {
                if (String.format("%s:%s.%s", projectId, datasetId, tableName).equals(t.getId())) {
                    exists = true;
                    if (renew) {
                        System.err.print("deleting " + tableName + " table ...");
                        bigquery.tables().delete(projectId, datasetId, tableName).execute();
                        System.err.println(" done");
                    }
                    break;
                }
            }
        } catch (NullPointerException e) {
            System.err.println(" NullPointerException ignored");
        }
        final TableReference tref = insertTable(tableName, loadSchema(schema), renew || !exists);
        uploadCsvIntoTable(dir, tableName, tref, resumable, useGZipContent, maxBadRecords);
    }

    public TableReference insertTable(String tableName, TableSchema schema, boolean create) throws Exception {
        TableReference tref = new TableReference().setProjectId(projectId).setDatasetId(datasetId)
                .setTableId(tableName);
        if (!create) {
            return tref;
        }
        System.err.print("inserting " + tableName + " table ...");
        final Table table = new Table().setSchema(schema).setTableReference(tref);
        autoRetry(6, new Callable<Void>() {
            @Override
            public Void call() throws Exception {
                bigquery.tables().insert(projectId, datasetId, table).execute();
                return null;
            }
        });
        System.err.println(" done");
        return tref;
    }

    /**
     * Inserts each CSV file into the BigQuery table as a load job, retrying automatically
     * on failure.
     *
     * @param dir            directory containing the CSV files
     * @param tableName      table name; files matching Mysqldump2csv.CSV_FILE_NAME_PATTERN are uploaded
     * @param tref           reference to the destination table
     * @param resumable      true for resumable upload; false for direct upload
     * @param useGZipContent true to gzip the content during transfer; false to send it as-is
     * @param maxBadRecords  number of bad records to tolerate; normally 0
     * @throws Exception thrown when an upload still fails after 10 retries
     */
    public void uploadCsvIntoTable(File dir, final String tableName, TableReference tref, final boolean resumable,
            final boolean useGZipContent, final int maxBadRecords) throws Exception {
        System.err.println(" upload records into " + tableName);
        JobConfigurationLoad jobLoad = new JobConfigurationLoad().setDestinationTable(tref)
                .setCreateDisposition("CREATE_NEVER").setWriteDisposition("WRITE_APPEND")
                .setMaxBadRecords(maxBadRecords).setSourceUris(null);
        JobConfiguration jobConfig = new JobConfiguration().setLoad(jobLoad);
        JobReference jobRef = new JobReference().setProjectId(projectId);
        final Job outputJob = new Job().setConfiguration(jobConfig).setJobReference(jobRef);
        final Pattern fileNamePattern = Pattern
                .compile(String.format(Mysqldump2csv.CSV_FILE_NAME_PATTERN, tableName));
        File[] csvs = dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return fileNamePattern.matcher(name).matches();
            }
        });
        if (csvs.length == 0) {
            System.err.println(" no record found in " + tableName);
        }
        Arrays.sort(csvs);
        long totalSize = 0;
        for (File csv : csvs) {
            totalSize += csv.length();
        }
        System.err.println(String.format(" total %d files, %,3dkB", csvs.length, totalSize / 1000));
        final UploadAdaptor ua = new UploadAdaptor(totalSize);
        for (final File csv : csvs) {
            autoRetry(10, new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    System.err.print(" target=" + csv + " .");
                    Matcher matcher = fileNamePattern.matcher(csv.getName());
                    if (!matcher.matches()) {
                        throw new IllegalStateException();
                    }
                    String zip = matcher.group(2);
                    InputStream in;
                    if (zip == null || zip.length() == 0) {
                        in = new BufferedInputStream(new FileInputStream(csv));
                    } else {
                        ZipInputStream zin = new ZipInputStream(new FileInputStream(csv));
                        // position the stream at the first entry; without this, read() reports EOF
                        zin.getNextEntry();
                        in = zin;
                    }
                    try {
                        InputStreamContent mediaContent = new InputStreamContent("application/octet-stream", in);
                        // the content length must be left unset when a resumable upload is
                        // combined with gzip content
                        if (!resumable || !useGZipContent) {
                            mediaContent.setLength(csv.length());
                        }
                        Insert insert = bigquery.jobs().insert(projectId, outputJob, mediaContent);
                        insert.getMediaHttpUploader().setDirectUploadEnabled(!resumable)
                                .setDisableGZipContent(!useGZipContent).setProgressListener(ua);
                        JobStatus status = insert.execute().getStatus();
                        if (status.getErrors() != null && status.getErrors().size() > 0) {
                            throw new IllegalStateException("job has error(s) " + status.getErrors());
                        }
                        ua.uploadedSize += csv.length();
                        return null;
                    } finally {
                        in.close();
                    }
                }
            });
        }
    }

    private static class UploadAdaptor implements MediaHttpUploaderProgressListener {
        final long totalSize;
        long uploadedSize, prev, prevSize;

        UploadAdaptor(long totalSize) {
            this.totalSize = totalSize;
        }

        public void progressChanged(MediaHttpUploader uploader) throws IOException {
            switch (uploader.getUploadState()) {
            case INITIATION_STARTED:
                System.err.print(".");
                prev = System.currentTimeMillis();
                prevSize = 0;
                break;
            case INITIATION_COMPLETE:
                System.err.print(".");
                break;
            case MEDIA_IN_PROGRESS:
                long now = System.currentTimeMillis();
                if (now - prev >= 60000) {
                    long size = uploader.getNumBytesUploaded();
                    System.err.println();
                    // bytes per millisecond equals kB per second
                    System.err.print(String.format(" %,3dkB, %.2f%% (%,3dkB/sec), " + new Date(now) + " .",
                            (uploadedSize + size) / 1000, (uploadedSize + size) * 100f / totalSize,
                            (size - prevSize) / (now - prev)));
                    prev = now;
                    prevSize = size;
                } else {
                    System.err.print(".");
                }
                break;
            case MEDIA_COMPLETE:
                System.err.println();
                break;
            case NOT_STARTED:
                throw new IllegalStateException("upload not started");
            }
        }
    }
}
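The *.schema files that loadSchema reads are plain text with one field per line, the field name and the BigQuery type separated by a tab character, as its javadoc notes. A minimal hypothetical example for a table named user (the real columns depend on your MySQL dump):

    id	INTEGER
    name	STRING
    created	TIMESTAMP

The row data is then picked up from CSV files named after the table, following the "${tableName}.([0-9]+.)?.csv" pattern mentioned in uploadTable's javadoc, e.g. user.csv for a single file, or user.0.csv, user.1.csv, ... for a split dump.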