// Java tutorial
/********************************************************************************
 * MIT Java Wordnet Interface Library (JWI) v2.3.3
 * Copyright (c) 2007-2014 Massachusetts Institute of Technology
 *
 * JWI is distributed under the terms of the Creative Commons Attribution 3.0
 * Unported License, which means it may be freely used for all purposes, as long
 * as proper acknowledgment is made.  See the license file included with this
 * distribution for more details.
 *******************************************************************************/

package edu.mit.jwi.data;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import edu.mit.jwi.RAMDictionary;
import edu.mit.jwi.data.parse.ILineParser;
import edu.mit.jwi.item.ISynset;
import edu.mit.jwi.item.IVersion;
import edu.mit.jwi.item.Synset;

/**
 * <p>
 * Implementation of a data provider for Wordnet that uses files in the file
 * system to back instances of its data sources. This implementation takes a
 * <code>URL</code> to a file system directory as its path argument, and uses
 * the resource hints from the data types and parts of speech for its content
 * types to examine the filenames in that directory to determine which files
 * contain which data.
 * </p>
 * <p>
 * This implementation supports loading the wordnet files into memory, but this
 * is actually not that beneficial for speed. This is because the implementation
 * loads the file data into memory uninterpreted, and on modern machines, the
 * time to interpret a line of data (i.e., parse it into a Java object) is much
 * larger than the time it takes to load the line from disk. Those wishing to
 * achieve speed increases from loading Wordnet into memory should rely on the
 * implementation in {@link RAMDictionary}, or something similar, which
 * pre-processes the Wordnet data into objects before caching them.
 * </p>
 *
 * @author Mark A. Finlayson
 * @version 2.3.3-hadoop
 * @since JWI 1.0
 */
public class FileProvider implements IDataProvider, ILoadable, ILoadPolicy {

    // final instance fields
    private final Lock lifecycleLock = new ReentrantLock();
    private final Lock loadingLock = new ReentrantLock();
    private final Set<IContentType<?>> types;

    // instance fields
    private URL url = null;
    private IVersion version = null;
    private Map<IContentType<?>, ILoadableDataSource<?>> fileMap = null;
    private int loadPolicy = NO_LOAD;

    // volatile: assigned in load(boolean), cleared by the loader thread itself
    // when it finishes, and read by close() without holding loadingLock
    private transient volatile JWIBackgroundLoader loader = null;

    /**
     * Reads the whole content of a file on the given Hadoop file system into a
     * string. Lines are read one by one and each is terminated with a single
     * <code>'\n'</code> in the result. The bytes are decoded as UTF-8.
     * <p>
     * This method is best-effort: any exception raised while opening or
     * reading the file is printed to standard error and whatever was read up
     * to that point (possibly the empty string) is returned.
     * </p>
     *
     * @param fs
     *            the file system that holds the file
     * @param path
     *            the path of the file to read
     * @return the text read from the file, never <code>null</code>
     * @since JWI 2.3.3-hadoop
     */
    public static String ReadFileFromHdfs(FileSystem fs, Path path) {
        // created before the try block so the final toString() can never hit a
        // null builder when opening the file fails
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(fs.open(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Reads the whole content of a local file into a string, decoding the
     * bytes with the platform default charset.
     *
     * @param path
     *            the path of the local file to read
     * @return the content of the file
     * @throws IOException
     *             if the file cannot be read
     * @since JWI 2.3.3-hadoop
     */
    static String ReadFile(String path) throws IOException {
        byte[] encoded = Files.readAllBytes(Paths.get(path));
        return new String(encoded);
    }

    /**
     * Makes a local directory from a Hadoop path: the regular files found
     * directly under <code>hdfsPath</code> are copied into a freshly created
     * local temporary directory, and that directory is returned. Files in the
     * temporary directory whose name starts with a dot are deleted after the
     * copy (presumably the checksum side files written by the Hadoop local
     * file system). The directory and the files kept in it are registered for
     * deletion when the JVM exits.
     * <p>
     * Progress is printed to standard output. Any failure is printed to
     * standard error and reported by returning <code>null</code>.
     * </p>
     *
     * from
     * http://stackoverflow.com/questions/3444313/how-to-convert-a-hadoop-path-
     * object-into-a-java-file-object
     *
     * @param hdfsPath
     *            the Hadoop directory whose files should be copied
     * @param conf
     *            the Hadoop configuration used to obtain the file system
     * @author Mauro Pelucchi
     * @since JWI 2.3.3-hadoop
     * @return the local temporary directory holding the copied files, or
     *         <code>null</code> if anything went wrong
     */
    public static File MakeFileFromPath(Path hdfsPath, Configuration conf) {
        try {
            FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);

            // create the directory in one step: no create-file/delete/mkdir
            // race, and no minimum length requirement on the prefix
            File tempFolder = Files.createTempDirectory(hdfsPath.getName()).toFile();

            // registered before its content: deletion on exit happens in
            // reverse registration order, so the files go first and the
            // directory is empty by the time it is deleted
            tempFolder.deleteOnExit();

            FileStatus[] status = fs.listStatus(hdfsPath);
            for (int i = 0; i < status.length; i++) {
                System.out.println("------------------------------------");
                if (status[i].isFile()) {
                    System.out.println(status[i].getPath());
                    fs.copyToLocalFile(false, status[i].getPath(), new Path(tempFolder.getAbsolutePath()));
                }
            }

            File[] files = tempFolder.listFiles();
            if (files == null) {
                throw new IOException("Could not list temp directory: " + tempFolder.getAbsolutePath());
            }
            for (int i = 0; i < files.length; i++) {
                System.out.println("------------------------------------");
                System.out.println(files[i].getPath());
                if (files[i].getName().startsWith(".")) {
                    System.out.println("Delete --> " + files[i].getPath());
                    if (!(files[i].delete())) {
                        throw new IOException("Could not delete temp file: " + files[i].getAbsolutePath());
                    }
                } else {
                    files[i].deleteOnExit();
                }
            }
            return tempFolder;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Constructs the file provider pointing to the hdfs resource indicated by
     * the path. This constructor requires the Hadoop configuration to read
     * from hdfs: the files of the directory are first copied to a local
     * temporary directory (see
     * {@link #MakeFileFromPath(Path, Configuration)}), which then backs the
     * provider.
     *
     * @param path
     *            the path of the wordnet directory on hdfs
     * @param conf
     *            the Hadoop configuration used to access the file system
     * @throws NullPointerException
     *             if the hdfs directory could not be copied to a local
     *             directory
     * @since JWI 2.3.3-hadoop
     * @author Mauro Pelucchi
     */
    public FileProvider(String path, Configuration conf) {
        this(toURL(MakeFileFromPath(new Path(path), conf)));
    }

    /**
     * Constructs the file provider pointing to the resource indicated by the
     * path. This file provider has an initial {@link ILoadPolicy#NO_LOAD} load
     * policy.
     *
     * @param file
     *            A file pointing to the wordnet directory, may not be
     *            <code>null</code>
     * @throws NullPointerException
     *             if the specified file is <code>null</code>
     * @since JWI 1.0
     */
    public FileProvider(File file) {
        this(toURL(file));
    }

    /**
     * Constructs the file provider pointing to the resource indicated by the
     * path, with the specified load policy.
     *
     * @param file
     *            A file pointing to the wordnet directory, may not be
     *            <code>null</code>
     * @param loadPolicy
     *            the load policy for this provider; this provider supports the
     *            three values defined in <code>ILoadPolicy</code>.
     * @throws NullPointerException
     *             if the specified file is <code>null</code>
     * @since JWI 2.2.0
     */
    public FileProvider(File file, int loadPolicy) {
        this(toURL(file), loadPolicy, ContentType.values());
    }

    /**
     * Constructs the file provider pointing to the resource indicated by the
     * path, with the specified load policy, looking for the specified content
     * types.
     *
     * @param file
     *            A file pointing to the wordnet directory, may not be
     *            <code>null</code>
     * @param loadPolicy
     *            the load policy for this provider; this provider supports the
     *            three values defined in <code>ILoadPolicy</code>.
     * @param types
     *            the content types this provider will look for when it loads
     *            its data; may not be <code>null</code> or empty
     * @throws NullPointerException
     *             if the file or content type collection is <code>null</code>
     * @throws IllegalArgumentException
     *             if the set of types is empty
     * @since JWI 2.2.0
     */
    public FileProvider(File file, int loadPolicy, Collection<? extends IContentType<?>> types) {
        this(toURL(file), loadPolicy, types);
    }

    /**
     * Constructs the file provider pointing to the resource indicated by the
     * path. This file provider has an initial {@link ILoadPolicy#NO_LOAD} load
     * policy.
     *
     * @param url
     *            A file URL in UTF-8 decodable format, may not be
     *            <code>null</code>
     * @throws NullPointerException
     *             if the specified URL is <code>null</code>
     * @since JWI 1.0
     */
    public FileProvider(URL url) {
        this(url, NO_LOAD);
    }

    /**
     * Constructs the file provider pointing to the resource indicated by the
     * path, with the specified load policy.
     *
     * @param url
     *            A file URL in UTF-8 decodable format, may not be
     *            <code>null</code>
     * @param loadPolicy
     *            the load policy for this provider; this provider supports the
     *            three values defined in <code>ILoadPolicy</code>.
     * @throws NullPointerException
     *             if the specified URL is <code>null</code>
     * @since JWI 2.2.0
     */
    public FileProvider(URL url, int loadPolicy) {
        this(url, loadPolicy, ContentType.values());
    }

    /**
     * Constructs the file provider pointing to the resource indicated by the
     * path, with the specified load policy, looking for the specified content
     * types.
     *
     * @param url
     *            A file URL in UTF-8 decodable format, may not be
     *            <code>null</code>
     * @param loadPolicy
     *            the load policy for this provider; this provider supports the
     *            three values defined in <code>ILoadPolicy</code>.
     * @param types
     *            the content types this provider will look for when it loads
     *            its data; may not be <code>null</code> or empty
     * @throws NullPointerException
     *             if the url or content type collection is <code>null</code>
     * @throws IllegalArgumentException
     *             if the set of types is empty
     * @since JWI 2.2.0
     */
    public FileProvider(URL url, int loadPolicy, Collection<? extends IContentType<?>> types) {
        if (url == null)
            throw new NullPointerException();
        if (types.isEmpty())
            throw new IllegalArgumentException();
        this.url = url;
        this.loadPolicy = loadPolicy;
        this.types = Collections.unmodifiableSet(new HashSet<IContentType<?>>(types));
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.IDataProvider#getSource()
     */
    public URL getSource() {
        return url;
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.ILoadPolicy#getLoadPolicy()
     */
    public int getLoadPolicy() {
        return loadPolicy;
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.IDataProvider#setSource(java.net.URL)
     */
    public void setSource(URL url) {
        if (isOpen())
            throw new IllegalStateException("provider currently open");
        if (url == null)
            throw new NullPointerException();
        this.url = url;
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.ILoadPolicy#setLoadPolicy(int)
     */
    public void setLoadPolicy(int policy) {
        loadingLock.lock();
        try {
            this.loadPolicy = policy;
        } finally {
            loadingLock.unlock();
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.item.IHasVersion#getVersion()
     */
    public IVersion getVersion() {
        checkOpen();
        // computed lazily and cached until the provider is closed
        if (version == null)
            version = determineVersion(fileMap.values());
        if (version == IVersion.NO_VERSION)
            return null;
        return version;
    }

    /**
     * Determines a version from the set of data sources, if possible, otherwise
     * returns {@link IVersion#NO_VERSION}
     *
     * @param srcs
     *            the data sources to be used to determine the version
     * @return the single version that describes these data sources, or
     *         {@link IVersion#NO_VERSION} if there is none
     * @since JWI 2.1.0
     */
    protected IVersion determineVersion(Collection<? extends IDataSource<?>> srcs) {
        IVersion ver = IVersion.NO_VERSION;
        for (IDataSource<?> dataSrc : srcs) {

            // if no version to set, ignore
            if (dataSrc.getVersion() == null)
                continue;

            // init version
            if (ver == IVersion.NO_VERSION) {
                ver = dataSrc.getVersion();
                continue;
            }

            // if version different from current
            if (!ver.equals(dataSrc.getVersion()))
                return IVersion.NO_VERSION;
        }
        return ver;
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.IHasLifecycle#open()
     */
    public boolean open() throws IOException {
        // locks are taken before the try blocks so that a finally clause
        // never unlocks a lock that was not acquired, and are released in the
        // reverse order of acquisition
        lifecycleLock.lock();
        try {
            loadingLock.lock();
            try {
                int policy = getLoadPolicy();

                // make sure directory exists
                File directory = toFile(url);
                if (!directory.exists())
                    throw new IOException("Dictionary directory does not exist: " + directory);

                // get files in directory; listFiles returns null when the
                // path is not a directory or cannot be read
                File[] listed = directory.listFiles(new FileFilter() {
                    public boolean accept(File file) {
                        return file.isFile();
                    }
                });
                if (listed == null || listed.length == 0)
                    throw new IOException("No files found in " + directory);

                // mutable copy: createSourceMap removes the files it consumes
                List<File> files = new ArrayList<File>(Arrays.asList(listed));

                // make the source map
                Map<IContentType<?>, ILoadableDataSource<?>> hiddenMap = createSourceMap(files, policy);
                if (hiddenMap.isEmpty())
                    return false;

                // determine if it's already unmodifiable, wrap if not
                Map<?, ?> map = Collections.unmodifiableMap(Collections.emptyMap());
                if (hiddenMap.getClass() != map.getClass())
                    hiddenMap = Collections.unmodifiableMap(hiddenMap);
                this.fileMap = hiddenMap;

                // do load
                try {
                    switch (policy) {
                    case BACKGROUND_LOAD:
                        load(false);
                        break;
                    case IMMEDIATE_LOAD:
                        load(true);
                        break;
                    default:
                        // do nothing
                    }
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    // keep the interruption visible to the caller
                    Thread.currentThread().interrupt();
                }

                return true;
            } finally {
                loadingLock.unlock();
            }
        } finally {
            lifecycleLock.unlock();
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.ILoadable#load()
     */
    public void load() {
        try {
            load(false);
        } catch (InterruptedException e) {
            e.printStackTrace();
            Thread.currentThread().interrupt();
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.ILoadable#load(boolean)
     */
    public void load(boolean block) throws InterruptedException {
        loadingLock.lock();
        try {
            checkOpen();
            if (isLoaded())
                return;
            if (loader != null)
                return;

            // work on a local reference: the loader thread clears the field
            // when it finishes, which may happen before join() is reached
            JWIBackgroundLoader bgLoader = new JWIBackgroundLoader();
            loader = bgLoader;
            bgLoader.start();
            if (block)
                bgLoader.join();
        } finally {
            // release the lock taken above
            loadingLock.unlock();
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.ILoadable#isLoaded()
     */
    public boolean isLoaded() {
        if (!isOpen())
            throw new IllegalStateException("provider not open");
        loadingLock.lock();
        try {
            for (ILoadableDataSource<?> source : fileMap.values())
                if (!source.isLoaded())
                    return false;
            return true;
        } finally {
            loadingLock.unlock();
        }
    }

    /**
     * Creates the map that contains the content types mapped to the data
     * sources. The method should return a non-null result, but it may be empty
     * if no data sources can be created. Subclasses may override this method.
     *
     * @param files
     *            the files from which the data sources should be created, may
     *            not be <code>null</code>; files that are matched to a content
     *            type are removed from this list
     * @param policy
     *            the load policy of the provider
     * @return a map, possibly empty, but not <code>null</code>, of content
     *         types mapped to data sources
     * @throws NullPointerException
     *             if the file list is <code>null</code>
     * @throws IOException
     *             if there is a problem creating the data source
     * @since JWI 2.2.0
     */
    protected Map<IContentType<?>, ILoadableDataSource<?>> createSourceMap(List<File> files, int policy)
            throws IOException {
        Map<IContentType<?>, ILoadableDataSource<?>> result = new HashMap<IContentType<?>, ILoadableDataSource<?>>();
        File file;
        for (IContentType<?> type : types) {
            file = DataType.find(type.getDataType(), type.getPOS(), files);
            if (file == null)
                continue;
            files.remove(file);
            result.put(type, createDataSource(file, type, policy));
        }
        return result;
    }

    /**
     * Creates the actual data source implementations. For data files a direct
     * access source is tried first; if looking up the first synset by its
     * offset fails, that source is closed and a binary search source is used
     * instead.
     *
     * @param <T>
     *            the content type of the data source
     * @param file
     *            the file from which the data source should be created, may not
     *            be <code>null</code>
     * @param type
     *            the content type of the data source
     * @param policy
     *            the load policy to follow when creating the data source
     * @return the created data source
     * @throws NullPointerException
     *             if any argument is <code>null</code>
     * @throws IOException
     *             if there is an IO problem when creating the data source
     * @since JWI 2.2.0
     */
    protected <T> ILoadableDataSource<T> createDataSource(File file, IContentType<T> type, int policy)
            throws IOException {

        ILoadableDataSource<T> src;

        if (type.getDataType() == DataType.DATA) {

            src = createDirectAccess(file, type);
            src.open();
            if (policy == IMMEDIATE_LOAD) {
                try {
                    src.load(true);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    Thread.currentThread().interrupt();
                }
            }

            // check to see if direct access works with the file
            // often people will extract the files incorrectly on windows
            // machines
            // and the binary files will be corrupted with extra CRs

            // get first line
            Iterator<String> itr = src.iterator();
            String firstLine = itr.next();
            if (firstLine == null)
                return src;

            // extract key
            ILineParser<T> parser = type.getDataType().getParser();
            ISynset s = (ISynset) parser.parseLine(firstLine);
            String key = Synset.zeroFillOffset(s.getOffset());

            // try to find line by direct access
            String soughtLine = src.getLine(key);
            if (soughtLine != null)
                return src;

            System.err.println(System.currentTimeMillis() + " - Error on direct access in "
                    + type.getPOS().toString() + " data file: check CR/LF endings");

            // the direct access source is abandoned: release it before
            // falling back to binary search
            src.close();
        }

        src = createBinarySearch(file, type);
        src.open();
        if (policy == IMMEDIATE_LOAD) {
            try {
                src.load(true);
            } catch (InterruptedException e) {
                e.printStackTrace();
                Thread.currentThread().interrupt();
            }
        }
        return src;
    }

    /**
     * Creates a direct access data source for the specified type, using the
     * specified file.
     *
     * @param <T>
     *            the parameter of the content type
     * @param file
     *            the file on which the data source is based; may not be
     *            <code>null</code>
     * @param type
     *            the data type for the data source; may not be
     *            <code>null</code>
     * @return the data source
     * @throws NullPointerException
     *             if either argument is <code>null</code>
     * @throws IOException
     *             if there is an IO problem when creating the data source
     *             object
     * @since JWI 2.2.0
     */
    protected <T> ILoadableDataSource<T> createDirectAccess(File file, IContentType<T> type) throws IOException {
        return new DirectAccessWordnetFile<T>(file, type);
    }

    /**
     * Creates a binary search data source for the specified type, using the
     * specified file.
     *
     * @param <T>
     *            the parameter of the content type
     * @param file
     *            the file on which the data source is based; may not be
     *            <code>null</code>
     * @param type
     *            the data type for the data source; may not be
     *            <code>null</code>
     * @return the data source
     * @throws NullPointerException
     *             if either argument is <code>null</code>
     * @throws IOException
     *             if there is an IO problem when creating the data source
     *             object
     * @since JWI 2.2.0
     */
    protected <T> ILoadableDataSource<T> createBinarySearch(File file, IContentType<T> type) throws IOException {
        return new BinarySearchWordnetFile<T>(file, type);
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.IHasLifecycle#isOpen()
     */
    public boolean isOpen() {
        lifecycleLock.lock();
        try {
            return fileMap != null;
        } finally {
            lifecycleLock.unlock();
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.IClosable#close()
     */
    public void close() {
        lifecycleLock.lock();
        try {
            if (!isOpen())
                return;

            // read the field once: the loader thread sets it to null when it
            // finishes, possibly between a null check and the call
            JWIBackgroundLoader bgLoader = loader;
            if (bgLoader != null)
                bgLoader.cancel();

            for (IDataSource<?> source : fileMap.values())
                source.close();
            fileMap = null;

            // drop the cached version so that it is recomputed from the data
            // sources of the next open()
            version = null;
        } finally {
            lifecycleLock.unlock();
        }
    }

    /**
     * Convenience method that throws an exception if the provider is closed.
     *
     * @throws ObjectClosedException
     *             if the provider is closed
     * @since JWI 1.1
     */
    protected void checkOpen() {
        if (!isOpen())
            throw new ObjectClosedException();
    }

    /*
     * (non-Javadoc)
     *
     * @see
     * edu.mit.jwi.data.IDataProvider#getSource(edu.mit.jwi.data.IContentType)
     */
    // no way to safely cast; must rely on registerSource method to assure
    // compliance
    @SuppressWarnings("unchecked")
    public <T> ILoadableDataSource<T> getSource(IContentType<T> type) {
        checkOpen();
        return (ILoadableDataSource<T>) fileMap.get(type);
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.mit.jwi.data.IDataProvider#getTypes()
     */
    public Set<? extends IContentType<?>> getTypes() {
        return types;
    }

    /**
     * A thread class which tries to load each data source in this provider.
     *
     * @author Mark A. Finlayson
     * @version 2.3.3
     * @since JWI 2.2.0
     */
    protected class JWIBackgroundLoader extends Thread {

        // cancel flag; volatile because it is set by the thread calling
        // cancel() and read by the loader thread
        private transient volatile boolean cancel = false;

        /**
         * Constructs a new background loader that operates on the internal data
         * structures of this provider.
         *
         * @since JWI 2.2.0
         */
        public JWIBackgroundLoader() {
            setName(JWIBackgroundLoader.class.getSimpleName());
            setDaemon(true);
        }

        /*
         * (non-Javadoc)
         *
         * @see java.lang.Thread#run()
         */
        @Override
        public void run() {
            try {
                for (ILoadableDataSource<?> source : fileMap.values()) {
                    // the flag only ever goes from false to true, so once it
                    // is set nothing further would be loaded
                    if (cancel)
                        break;
                    if (!source.isLoaded()) {
                        try {
                            source.load(true);
                        } catch (InterruptedException e) {
                            // this is the thread's own top-level loop: report
                            // and carry on with the remaining sources
                            e.printStackTrace();
                        }
                    }
                }
            } finally {
                loader = null;
            }
        }

        /**
         * Sets the cancel flag for this loader and waits for the loader thread
         * to finish.
         *
         * @since JWI 2.2.0
         */
        public void cancel() {
            cancel = true;
            try {
                join();
            } catch (InterruptedException e) {
                e.printStackTrace();
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Transforms a URL into a File. The URL must use the 'file' protocol and
     * must be in a UTF-8 compatible format as specified in
     * {@link java.net.URLDecoder}.
     *
     * @param url
     *            the url to be transformed
     * @return a file pointing to the same place as the url
     * @throws NullPointerException
     *             if the url is <code>null</code>
     * @throws IllegalArgumentException
     *             if the url does not use the 'file' protocol
     * @throws IOException
     *             declared for compatibility with existing callers
     * @since JWI 1.0
     */
    public static File toFile(URL url) throws IOException {
        if (!url.getProtocol().equals("file"))
            throw new IllegalArgumentException("URL source must use 'file' protocol");
        try {
            return new File(URLDecoder.decode(url.getPath(), "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Transforms a file into a URL. The path of the returned URL is
     * percent-encoded such that {@link #toFile(URL)} yields the original
     * path again.
     *
     * @param file
     *            the file to be transformed
     * @return a URL representing the file, or <code>null</code> if the URL
     *         could not be constructed
     * @throws NullPointerException
     *             if the specified file is <code>null</code>
     * @since JWI 2.2.0
     */
    public static URL toURL(File file) {
        if (file == null)
            throw new NullPointerException();
        try {
            URI uri = file.toPath().toUri();
            // '+' is legal in a URI path and therefore left unescaped, but
            // URLDecoder (used by toFile) decodes it to a space; escape it so
            // that paths containing '+' survive the round trip
            String path = uri.getRawPath().replace("+", "%2B");
            return new URL("file", null, path);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}