List of usage examples for org.apache.hadoop.fs FileSystem makeQualified
public Path makeQualified(Path path)
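Before the examples, a minimal, self-contained sketch of what makeQualified does (the hdfs://namenode:8020/user/alice prefix in the comment is illustrative only; the actual scheme, authority, and working directory come from fs.defaultFS and the current user):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The FileSystem is resolved from fs.defaultFS (e.g. an HDFS URI or file:///)
        FileSystem fs = FileSystem.get(conf);

        // A relative, scheme-less path
        Path relative = new Path("data/input");

        // makeQualified adds the filesystem's scheme and authority and resolves the
        // path against the working directory, e.g.
        // hdfs://namenode:8020/user/alice/data/input
        Path qualified = fs.makeQualified(relative);
        System.out.println(qualified);
    }
}

The examples below follow the same pattern: a bare path such as "/tmp", "schema.avsc", or a home directory is qualified against a specific FileSystem so that the resulting URI unambiguously refers to that filesystem rather than whatever the local default happens to be.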
From source file: org.kitesdk.cli.commands.InputFormatImportCommand.java
License: Apache License

@Override
@SuppressWarnings("unchecked")
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DM_CONVERT_CASE", justification = "For record types only")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 2,
            "Data path and target dataset are required.");

    Path source = qualifiedPath(targets.get(0));
    FileSystem sourceFS = source.getFileSystem(getConf());
    Preconditions.checkArgument(sourceFS.exists(source), "Data path does not exist: " + source);

    // this throws IllegalArgumentException if the type is invalid.
    InputFormatUtil.RecordType.valueOf(recordType.trim().toUpperCase());

    String dataset = targets.get(1);

    View<Object> target = load(dataset, Object.class);
    Schema schema = target.getDataset().getDescriptor().getSchema();

    // Build a dataset around the incoming data
    DatasetDescriptor.Builder descriptorBuilder = new DatasetDescriptor.Builder()
            .location(source.toUri())
            .property(InputFormatUtil.INPUT_FORMAT_CLASS_PROP, inFormatClass)
            .property(InputFormatUtil.INPUT_FORMAT_RECORD_PROP, recordType)
            .format(Formats.INPUTFORMAT)
            .schema(ColumnMappingParser
                    .removeEmbeddedMapping(PartitionStrategyParser.removeEmbeddedStrategy(schema)));

    for (Map.Entry<String, String> prop : properties.entrySet()) {
        descriptorBuilder.property(prop.getKey(), prop.getValue());
    }

    DatasetDescriptor inDescriptor = descriptorBuilder.build();

    TemporaryFileSystemDatasetRepository repo = new TemporaryFileSystemDatasetRepository(getConf(),
            // ensure the same FS as the file source is used
            sourceFS.makeQualified(new Path("/tmp")), target.getDataset().getNamespace(),
            UUID.randomUUID().toString());
    ClassLoader loader = loaderForJars(jars);

    try {
        FileSystemDataset<Object> inDataset = (FileSystemDataset) repo.create("import", "inputformat",
                inDescriptor);
        Iterator<Path> iter = inDataset.pathIterator().iterator();
        Preconditions.checkArgument(iter.hasNext(), "Data path has no data files: " + source);

        TaskUtil.configure(getConf()).addJars(jars);

        TransformTask task;
        if (transform != null) {
            DoFn<Object, Object> transformFn;
            try {
                DynConstructors.Ctor<DoFn<Object, Object>> ctor = new DynConstructors.Builder(DoFn.class)
                        .loader(loader).impl(transform).buildChecked();
                transformFn = ctor.newInstance();
            } catch (NoSuchMethodException e) {
                throw new DatasetException("Cannot find no-arg constructor for class: " + transform, e);
            }
            task = new TransformTask<Object, Object>(inDataset, target, transformFn);
        } else {
            task = new CopyTask<Object>(inDataset, target);
        }

        task.setConf(getConf());

        if (noCompaction) {
            task.noCompaction();
        }

        if (numWriters >= 0) {
            task.setNumWriters(numWriters);
        }

        PipelineResult result;
        try {
            result = runTaskWithClassLoader(task, loader);
        } catch (InterruptedException e) {
            // didn't finish
            return 1;
        }

        if (result.succeeded()) {
            long count = task.getCount();
            if (count > 0) {
                console.info("Added {} records to \"{}\"", count, dataset);
            }
            return 0;
        } else {
            return 1;
        }
    } finally {
        repo.delete();
    }
}
From source file: org.kitesdk.cli.commands.JSONImportCommand.java
License: Apache License

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() == 2,
            "JSON path and target dataset name are required.");

    Path source = qualifiedPath(targets.get(0));
    FileSystem sourceFS = source.getFileSystem(getConf());
    Preconditions.checkArgument(sourceFS.exists(source), "JSON path does not exist: " + source);

    String dataset = targets.get(1);

    View<Record> target = load(dataset, Record.class);
    Schema datasetSchema = target.getDataset().getDescriptor().getSchema();

    DatasetDescriptor jsonDescriptor = new DatasetDescriptor.Builder()
            .location(source.toUri())
            .schema(ColumnMappingParser
                    .removeEmbeddedMapping(PartitionStrategyParser.removeEmbeddedStrategy(datasetSchema)))
            .format("json")
            .build();

    TemporaryFileSystemDatasetRepository repo = new TemporaryFileSystemDatasetRepository(getConf(),
            // ensure the same FS as the file source is used
            sourceFS.makeQualified(new Path("/tmp")), target.getDataset().getNamespace(),
            UUID.randomUUID().toString());

    try {
        FileSystemDataset<Record> jsonDataset = (FileSystemDataset) repo.create("import", "json",
                jsonDescriptor);
        Iterator<Path> iter = jsonDataset.pathIterator().iterator();
        Preconditions.checkArgument(iter.hasNext(), "JSON path has no data files: " + source);

        TaskUtil.configure(getConf()).addJars(jars);

        TransformTask task;
        if (transform != null) {
            DoFn<Record, Record> transformFn;
            try {
                DynConstructors.Ctor<DoFn<Record, Record>> ctor = new DynConstructors.Builder(DoFn.class)
                        .loader(loaderForJars(jars)).impl(transform).buildChecked();
                transformFn = ctor.newInstance();
            } catch (NoSuchMethodException e) {
                throw new DatasetException("Cannot find no-arg constructor for class: " + transform, e);
            }
            task = new TransformTask<Record, Record>(jsonDataset, target, transformFn);
        } else {
            task = new CopyTask<Record>(jsonDataset, target);
        }

        task.setConf(getConf());

        if (noCompaction) {
            task.noCompaction();
        }

        if (numWriters >= 0) {
            task.setNumWriters(numWriters);
        }

        PipelineResult result = task.run();

        if (result.succeeded()) {
            long count = task.getCount();
            if (count > 0) {
                console.info("Added {} records to \"{}\"", count, dataset);
            }
            return 0;
        } else {
            return 1;
        }
    } finally {
        // clean up the temporary repository
        repo.delete();
    }
}
From source file: org.kitesdk.data.hcatalog.HiveUtils.java
License: Apache License

static DatasetDescriptor descriptorForTable(Configuration conf, Table table) {
    final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

    final String serializationLib = table.getSerializationLib();
    if (SERDE_TO_FORMAT.containsKey(serializationLib)) {
        builder.format(SERDE_TO_FORMAT.get(serializationLib));
    } else {
        // TODO: should this use an "unknown" format? others fail in open()
        throw new UnknownFormatException("Unknown format for serde:" + serializationLib);
    }

    final Path dataLocation = table.getPath();
    final FileSystem fs = fsForPath(conf, dataLocation);

    builder.location(fs.makeQualified(dataLocation));

    // custom properties
    String namesProperty = coalesce(table.getProperty(CUSTOM_PROPERTIES_PROPERTY_NAME),
            table.getProperty(OLD_CUSTOM_PROPERTIES_PROPERTY_NAME));
    if (namesProperty != null) {
        for (String property : NAME_SPLITTER.split(namesProperty)) {
            builder.property(property, table.getProperty(property));
        }
    }

    if (table.isPartitioned()) {
        String partitionProperty = coalesce(table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME),
                table.getProperty(OLD_PARTITION_EXPRESSION_PROPERTY_NAME));
        if (partitionProperty != null) {
            builder.partitionStrategy(Accessor.getDefault().fromExpression(partitionProperty));
        } else {
            // build a partition strategy for the table from the Hive strategy
            builder.partitionStrategy(fromPartitionColumns(table.getPartCols()));
        }
    }

    String schemaUrlString = table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME);
    if (schemaUrlString != null) {
        try {
            // URI.create is safe because this library wrote the URI
            builder.schemaUri(URI.create(schemaUrlString));
        } catch (IOException e) {
            throw new DatasetIOException("Could not read schema", e);
        }
    }

    String schemaLiteral = table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
    if (schemaLiteral != null) {
        builder.schemaLiteral(schemaLiteral);
    }

    try {
        return builder.build();
    } catch (IllegalStateException ex) {
        throw new DatasetException("Cannot find schema: missing metadata");
    }
}
From source file: org.kitesdk.data.mapreduce.FileSystemTestBase.java
License: Apache License

@Before
public void setUp() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fileSystem = FileSystem.get(conf);
    Path testDirectory = fileSystem.makeQualified(new Path(Files.createTempDir().getAbsolutePath()));
    this.repo = new FileSystemDatasetRepository.Builder().configuration(conf).rootDirectory(testDirectory)
            .build();
}
From source file: org.kitesdk.data.spi.filesystem.FileSystemDatasetRepository.java
License: Apache License

/**
 * Get a {@link org.kitesdk.data.spi.PartitionKey} corresponding to a partition's filesystem path
 * represented as a {@link URI}. If the path is not a valid partition,
 * then {@link IllegalArgumentException} is thrown. Note that the partition does not
 * have to exist.
 *
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @since 0.4.0
 */
@SuppressWarnings({ "unchecked", "deprecation" })
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(dataset.getDescriptor().isPartitioned(),
            "Attempt to get a partition on a non-partitioned dataset (name:%s)", dataset.getName());
    Preconditions.checkArgument(dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();
    URI relativizedUri = directoryUri.relativize(partitionUri);

    if (relativizedUri.equals(partitionUri)) {
        throw new IllegalArgumentException(
                String.format("Partition URI %s has different " + "root directory to dataset (directory: %s).",
                        partitionUri, directoryUri));
    }

    Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = Accessor.getDefault().getFieldPartitioners(partitionStrategy);
    if (Iterables.size(parts) > fieldPartitioners.size()) {
        throw new IllegalArgumentException(
                String.format("Too many partition directories " + "for %s (%s), expecting %s.", partitionUri,
                        Iterables.size(parts), fieldPartitioners.size()));
    }

    Schema schema = dataset.getDescriptor().getSchema();
    List<Object> values = Lists.newArrayList();
    int i = 0;
    for (String part : parts) {
        Iterator<String> split = Splitter.on('=').split(part).iterator();
        String fieldName = split.next();
        FieldPartitioner fp = fieldPartitioners.get(i++);
        if (!fieldName.equals(fp.getName())) {
            throw new IllegalArgumentException(
                    String.format("Unrecognized partition name " + "'%s' in partition %s, expecting '%s'.",
                            fieldName, partitionUri, fp.getName()));
        }
        if (!split.hasNext()) {
            throw new IllegalArgumentException(String.format(
                    "Missing partition value for " + "'%s' in partition %s.", fieldName, partitionUri));
        }
        String stringValue = split.next();

        values.add(PathConversion.valueForDirname(fp, schema, stringValue));
    }

    return new PartitionKey(values.toArray(new Object[values.size()]));
}
From source file: org.kitesdk.data.spi.hive.HiveUtils.java
License: Apache License

static DatasetDescriptor descriptorForTable(Configuration conf, Table table) {
    final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

    Format format;
    final String serializationLib = table.getSd().getSerdeInfo().getSerializationLib();
    if (SERDE_TO_FORMAT.containsKey(serializationLib)) {
        format = SERDE_TO_FORMAT.get(serializationLib);
        builder.format(format);
    } else {
        // TODO: should this use an "unknown" format? others fail in open()
        throw new UnknownFormatException("Unknown format for serde:" + serializationLib);
    }

    final Path dataLocation = new Path(table.getSd().getLocation());
    final FileSystem fs = fsForPath(conf, dataLocation);

    builder.location(fs.makeQualified(dataLocation));

    // custom properties
    Map<String, String> properties = table.getParameters();
    String namesProperty = coalesce(properties.get(CUSTOM_PROPERTIES_PROPERTY_NAME),
            properties.get(OLD_CUSTOM_PROPERTIES_PROPERTY_NAME));
    if (namesProperty != null) {
        for (String property : NAME_SPLITTER.split(namesProperty)) {
            builder.property(property, properties.get(property));
        }
    }

    PartitionStrategy partitionStrategy = null;
    if (isPartitioned(table)) {
        String partitionProperty = coalesce(properties.get(PARTITION_EXPRESSION_PROPERTY_NAME),
                properties.get(OLD_PARTITION_EXPRESSION_PROPERTY_NAME));
        if (partitionProperty != null) {
            partitionStrategy = Accessor.getDefault().fromExpression(partitionProperty);
        } else {
            // build a partition strategy for the table from the Hive strategy
            partitionStrategy = fromPartitionColumns(getPartCols(table));
        }
        builder.partitionStrategy(partitionStrategy);
    }

    String schemaUrlString = properties.get(AVRO_SCHEMA_URL_PROPERTY_NAME);
    if (schemaUrlString != null) {
        try {
            // URI.create is safe because this library wrote the URI
            builder.schemaUri(URI.create(schemaUrlString));
        } catch (IOException e) {
            throw new DatasetIOException("Could not read schema", e);
        }
    } else {
        String schemaLiteral = properties.get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
        if (schemaLiteral != null) {
            builder.schemaLiteral(schemaLiteral);
        } else {
            builder.schema(HiveSchemaConverter.convertTable(table.getTableName(), table.getSd().getCols(),
                    partitionStrategy));
        }
    }

    String compressionType = properties.get(COMPRESSION_TYPE_PROPERTY_NAME);
    if (compressionType != null) {
        builder.compressionType(compressionType);
    }

    try {
        return builder.build();
    } catch (IllegalStateException ex) {
        throw new DatasetException("Cannot find schema: missing metadata");
    }
}
From source file: org.kitesdk.data.TestDatasetDescriptor.java
License: Apache License

@Test
public void testSchemaFromHdfs() throws IOException {
    MiniDFSTest.setupFS();
    FileSystem fs = MiniDFSTest.getDFS();

    // copy a schema to HDFS
    Path schemaPath = fs.makeQualified(new Path("schema.avsc"));
    FSDataOutputStream out = fs.create(schemaPath);
    IOUtils.copyBytes(DatasetTestUtilities.USER_SCHEMA_URL.toURL().openStream(), out, fs.getConf());
    out.close();

    // build a schema using the HDFS path and check it's the same
    Schema schema = new DatasetDescriptor.Builder().schemaUri(schemaPath.toUri()).build().getSchema();

    Assert.assertEquals(DatasetTestUtilities.USER_SCHEMA, schema);

    MiniDFSTest.teardownFS();
}
From source file: org.kitesdk.minicluster.HBaseService.java
License: Apache License

/**
 * Configure the HBase cluster before launching it
 *
 * @param config
 *          already created Hadoop configuration we'll further configure for
 *          HDFS
 * @param zkClientPort
 *          The client port zookeeper is listening on
 * @param hdfsFs
 *          The HDFS FileSystem this HBase cluster will run on top of
 * @param bindIP
 *          The IP Address to force bind all sockets on. If null, will use
 *          defaults
 * @param masterPort
 *          The port the master listens on
 * @param regionserverPort
 *          The port the regionserver listens on
 * @return The updated Configuration object.
 * @throws IOException
 */
private static Configuration configureHBaseCluster(Configuration config, int zkClientPort, FileSystem hdfsFs,
        String bindIP, int masterPort, int regionserverPort) throws IOException {
    // Configure the zookeeper port
    config.set(HConstants.ZOOKEEPER_CLIENT_PORT, Integer.toString(zkClientPort));

    // Initialize HDFS path configurations required by HBase
    Path hbaseDir = new Path(hdfsFs.makeQualified(hdfsFs.getHomeDirectory()), "hbase");
    FSUtils.setRootDir(config, hbaseDir);
    hdfsFs.mkdirs(hbaseDir);
    config.set("fs.defaultFS", hdfsFs.getUri().toString());
    config.set("fs.default.name", hdfsFs.getUri().toString());
    FSUtils.setVersion(hdfsFs, hbaseDir);

    // Configure the bind addresses and ports. If running in Openshift, we only
    // have permission to bind to the private IP address, accessible through an
    // environment variable.
    logger.info("HBase force binding to ip: " + bindIP);
    config.set("hbase.master.ipc.address", bindIP);
    config.set(HConstants.MASTER_PORT, Integer.toString(masterPort));
    config.set("hbase.regionserver.ipc.address", bindIP);
    config.set(HConstants.REGIONSERVER_PORT, Integer.toString(regionserverPort));
    config.set(HConstants.ZOOKEEPER_QUORUM, bindIP);

    // By default, the HBase master and regionservers will report to zookeeper
    // that its hostname is what it determines by reverse DNS lookup, and not
    // what we use as the bind address. This means when we set the bind
    // address, daemons won't actually be able to connect to each other if they
    // are different. Here, we do something that's illegal in 48 states - use
    // reflection to override a private static final field in the DNS class
    // that is a cachedHostname. This way, we are forcing the hostname that
    // reverse dns finds. This may not be compatible with newer versions of
    // Hadoop.
    try {
        Field cachedHostname = DNS.class.getDeclaredField("cachedHostname");
        cachedHostname.setAccessible(true);
        Field modifiersField = Field.class.getDeclaredField("modifiers");
        modifiersField.setAccessible(true);
        modifiersField.setInt(cachedHostname, cachedHostname.getModifiers() & ~Modifier.FINAL);
        cachedHostname.set(null, bindIP);
    } catch (Exception e) {
        // Reflection can throw so many checked exceptions. Let's wrap in an
        // IOException.
        throw new IOException(e);
    }

    // By setting the info ports to -1, we won't launch the master or
    // regionserver info web interfaces
    config.set(HConstants.MASTER_INFO_PORT, "-1");
    config.set(HConstants.REGIONSERVER_INFO_PORT, "-1");
    return config;
}
From source file: org.lilyproject.hadooptestfw.fork.HBaseTestingUtility.java
License: Apache License

/**
 * Creates an hbase rootdir in user home directory. Also creates hbase
 * version file. Normally you won't make use of this method. Root hbasedir
 * is created for you as part of mini cluster startup. You'd only use this
 * method if you were doing manual operation.
 *
 * @return Fully qualified path to hbase root dir
 */
public Path createRootDir() throws IOException {
    FileSystem fs = FileSystem.get(this.conf);
    // Lily change: create "hbase" subdirectory under home directory
    // to serve as hbaseRootdir. The home directory can contain other
    // directories and files, which are not necessarily hbase tables.
    // For instance a 'target' dir created by MiniMRCluster.
    // Cfr. HBASE-5317 and HBASE-4025
    Path hbaseRootdir = fs.makeQualified(new Path(fs.getHomeDirectory(), "hbase"));
    this.conf.set(HConstants.HBASE_DIR, hbaseRootdir.toString());
    fs.mkdirs(hbaseRootdir);
    FSUtils.setVersion(fs, hbaseRootdir);
    return hbaseRootdir;
}
From source file: org.lilyproject.testfw.HadoopLauncher.java
License: Apache License

public MiniHBaseCluster startMiniCluster(final int servers) throws Exception {
    // Make a new random dir to home everything in. Set it as system property.
    // minidfs reads home from system property.
    this.clusterTestBuildDir = setupClusterTestBuildDir();
    System.setProperty(TEST_DIRECTORY_KEY, this.clusterTestBuildDir.getPath());

    // Bring up mini dfs cluster. This spews a bunch of warnings about missing
    // scheme. Complaints are 'Scheme is undefined for build/test/data/dfs/name1'.
    startMiniDFSCluster(servers, this.clusterTestBuildDir);

    // Mangle conf so fs parameter points to minidfs we just started up
    FileSystem fs = this.dfsCluster.getFileSystem();
    this.conf.set("fs.defaultFS", fs.getUri().toString());
    // Do old style too just to be safe.
    this.conf.set("fs.default.name", fs.getUri().toString());
    this.dfsCluster.waitClusterUp();

    // Start up a zk cluster.
    if (this.zkCluster == null) {
        startMiniZKCluster(this.clusterTestBuildDir);
    }

    // Now do the mini hbase cluster. Set the hbase.rootdir in config.
    Path hbaseRootdir = fs.makeQualified(fs.getHomeDirectory());
    this.conf.set(HConstants.HBASE_DIR, hbaseRootdir.toString());
    fs.mkdirs(hbaseRootdir);
    FSUtils.setVersion(fs, hbaseRootdir);
    Configuration c = new Configuration(this.conf);
    this.hbaseCluster = new MiniHBaseCluster(c, servers);

    // Don't leave here till we've done a successful scan of the .META.
    HTable t = new HTable(c, HConstants.META_TABLE_NAME);
    ResultScanner s = t.getScanner(new Scan());
    while (s.next() != null) {
        continue;
    }

    return this.hbaseCluster;
}