Java tutorial
/*******************************************************************************
 * Copyright (c) 2015, 2016 Naveen Kulkarni
 *
 * This file is part of Bag of Words program.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Contributors:
 * Naveen Kulkarni (naveen.kulkarni@research.iiit.ac.in)
 *
 *******************************************************************************/
package ctrus.pa.bow.java;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.eclipse.jdt.core.JavaCore;
import org.eclipse.jdt.core.dom.AST;
import org.eclipse.jdt.core.dom.ASTNode;
import org.eclipse.jdt.core.dom.ASTParser;
import org.eclipse.jdt.core.dom.ASTVisitor;
import org.eclipse.jdt.core.dom.AnnotationTypeDeclaration;
import org.eclipse.jdt.core.dom.Comment;
import org.eclipse.jdt.core.dom.CompilationUnit;
import org.eclipse.jdt.core.dom.EnumDeclaration;
import org.eclipse.jdt.core.dom.NormalAnnotation;
import org.eclipse.jdt.core.dom.PackageDeclaration;
import org.eclipse.jdt.core.dom.TypeDeclaration;

import ctrus.pa.bow.java.token.ClassTokens;
import ctrus.pa.bow.java.token.IdentifierTokens;
import ctrus.pa.bow.java.token.IdentifiersPosition;
import ctrus.pa.bow.java.token.MethodTokens;

/**
 * Parses one Java source file at a time with the Eclipse JDT parser and
 * collects a {@link ClassTokens} entry for every type declaration it visits.
 *
 * <p>Unless comments are ignored or state analysis is enabled, every comment
 * found in the file is attached to the identifier that the
 * {@link IdentifiersPosition} observer reports for the comment's position.
 *
 * <p>An instance may be reused for several files by calling
 * {@link #tokenize(List)} repeatedly; the results of the previous call are
 * discarded at the start of each call.
 */
public class JavaFileTokenizer extends ASTVisitor {

    private boolean _init = false;
    private boolean _ignoreComments = false;
    private boolean _considerCopyright = false;
    private boolean _stateAnalysis = false;

    private ASTParser _javaParser = null;
    private Map<String, String> _compilerOptions = null;
    private CompilationUnit _cu = null;

    private String _packageName = null;
    // Created eagerly so getClassTokens() never hands out null before the first tokenize().
    private final List<ClassTokens> _classTokens = new ArrayList<ClassTokens>();
    private IdentifiersPosition _positionObserver = null;

    /**
     * @return the fully qualified package name of the most recently tokenized
     *         file, or {@code null} if no package declaration was visited
     */
    public String getPackageName() {
        return _packageName;
    }

    /**
     * @return the class tokens collected by the most recent
     *         {@link #tokenize(List)} call; empty before the first call
     */
    public Iterable<ClassTokens> getClassTokens() {
        return _classTokens;
    }

    /**
     * @param ignore {@code true} to skip merging comments into the tokens
     */
    public void setIgnoreComments(boolean ignore) {
        _ignoreComments = ignore;
    }

    /**
     * @param consider {@code true} to keep comments that mention "copyright"
     *                 or "license"; {@code false} to drop them
     */
    public void setConsiderCopyright(boolean consider) {
        _considerCopyright = consider;
    }

    /**
     * @param stateAnalysis {@code true} to enable state analysis; in that mode
     *                      neither comments nor the package name are added
     */
    public void setStateAnalysis(boolean stateAnalysis) {
        _stateAnalysis = stateAnalysis;
    }

    /**
     * Tokenizes one Java source file given as a list of lines.
     *
     * @param sourceTextLines the lines of the source file, without line terminators
     * @throws IllegalArgumentException if a comment is reported against an
     *         identifier for which no class or method tokens were collected
     */
    public void tokenize(List<String> sourceTextLines) {
        // initialize if not done earlier
        if (!_init) {
            _init();
        }

        // Processing a new file: drop everything that belongs to the previous one,
        // including the package name (the new file may not declare a package).
        _classTokens.clear();
        _packageName = null;
        _positionObserver = new IdentifiersPosition();

        // Normalize single line comments to block comments
        StringBuilder sourceText = _preProcessSourceText(sourceTextLines);

        // ASTParser returns to its default settings after every createAST() call,
        // so kind and compiler options must be applied before each parse.
        _javaParser.setKind(ASTParser.K_COMPILATION_UNIT);
        _javaParser.setCompilerOptions(_compilerOptions);

        // Extract all identifiers from java source text
        _javaParser.setSource(sourceText.toString().toCharArray());
        _cu = (CompilationUnit) _javaParser.createAST(null);
        _cu.accept(this);

        // Extract all comments from java source text
        // and merge them with their corresponding identifiers
        if (!_ignoreComments && !_stateAnalysis) {
            _mergeComments(sourceText);
        }

        // Add package information
        if (!_stateAnalysis && _packageName != null) {
            for (ClassTokens c : _classTokens) {
                c.addToken(_packageName);
            }
        }
    }

    /**
     * Creates the parser and prepares the Java 1.8 compliance options that are
     * re-applied to it before every parse.
     */
    private void _init() {
        _javaParser = ASTParser.newParser(AST.JLS8);
        Map<String, String> options = JavaCore.getOptions();
        JavaCore.setComplianceOptions(JavaCore.VERSION_1_8, options);
        _compilerOptions = options;
        _init = true;
    }

    /**
     * Attaches every comment of the current compilation unit to the tokens of
     * the identifier reported for the comment's position.
     *
     * @param sourceText the pre-processed text that was handed to the parser;
     *                   comment positions refer to offsets in this text
     */
    @SuppressWarnings("unchecked") // getCommentList() is declared with a raw List
    private void _mergeComments(StringBuilder sourceText) {
        List<Comment> comments = (List<Comment>) _cu.getCommentList();
        if (comments == null) {
            return; // no comment information available for this unit
        }
        for (Comment comment : comments) {
            int start = comment.getStartPosition();
            int end = start + comment.getLength();
            String identifierOfComment = _positionObserver.getIdentifierForPosition(start, end);
            String commentText = sourceText.substring(start, end).replaceAll("[\\t\\n\\r]", " ");

            // Do filtering of comments
            if (!_considerCopyright) {
                // Locale.ROOT keeps the match independent of the default locale
                // (e.g. Turkish casing rules for 'I').
                String lowered = commentText.toLowerCase(Locale.ROOT);
                if (lowered.contains("copyright") || lowered.contains("license")) {
                    continue;
                }
            }
            // More filtering rules...TBD

            IdentifierTokens it = getTokens(identifierOfComment);
            it.addCommentToken(commentText);
        }
    }

    /**
     * Strips every line and rewrites runs of lines that start with {@code //}
     * into a single block comment, joining the lines with {@code '\n'}.
     *
     * <p>Limitation: a line starting with {@code //} is wrapped regardless of
     * context, so such a line inside an existing block comment is not handled
     * specially.
     *
     * @param sourceTextLines the raw source lines
     * @return the pre-processed source text
     */
    private StringBuilder _preProcessSourceText(List<String> sourceTextLines) {
        StringBuilder javaSourceTextBuffer = new StringBuilder();
        boolean commentMarked = false;
        for (String line : sourceTextLines) {
            String stripedLine = StringUtils.strip(line);
            if (stripedLine.startsWith("//")) {
                // Break up any "*/" in the comment text so it cannot terminate
                // the synthetic block comment prematurely.
                javaSourceTextBuffer.append("/* ").append(stripedLine.replace("*/", "* /"));
                commentMarked = true;
            } else {
                if (commentMarked) {
                    javaSourceTextBuffer.append("*/ ");
                    commentMarked = false;
                }
                javaSourceTextBuffer.append(stripedLine);
            }
            javaSourceTextBuffer.append("\n");
        }
        // A file that ends with a line comment would otherwise leave the
        // synthetic block comment unterminated.
        if (commentMarked) {
            javaSourceTextBuffer.append("*/\n");
        }
        return javaSourceTextBuffer;
    }

    /**
     * Looks up the class or method tokens whose identifier equals the given one.
     *
     * @param identifier the identifier to search for
     * @return the matching class tokens or method tokens
     * @throws IllegalArgumentException if no collected tokens carry that identifier
     */
    private IdentifierTokens getTokens(String identifier) {
        for (ClassTokens ct : _classTokens) {
            if (ct.getIdentifier().equals(identifier)) {
                return ct;
            }
            for (String mId : ct.getMethodIdentifiers()) {
                MethodTokens mt = ct.getMethodTokens(mId);
                if (mt.getIdentifier().equals(identifier)) {
                    return mt;
                }
            }
        }
        throw new IllegalArgumentException("Tokens not found for identifier - " + identifier);
    }

    //-------------------------------------------------------------------------
    // Overridden ASTVisitor visit methods

    /**
     * Collects the tokens of a type declaration and registers the position
     * observer with them.
     */
    @Override
    public boolean visit(TypeDeclaration node) {
        // Add class
        ClassTokens eachClassTokens = new ClassTokens(_ignoreComments, _stateAnalysis);
        eachClassTokens.acceptVisitor(_positionObserver);
        eachClassTokens.addTokens(node);
        _classTokens.add(eachClassTokens);
        return true; // continue visiting other class definitions if found
    }

    /** Records the fully qualified name of the declared package. */
    @Override
    public boolean visit(PackageDeclaration node) {
        _packageName = node.getName().getFullyQualifiedName();
        return true;
    }

    /** Annotation type declarations contribute no tokens; children are still visited. */
    @Override
    public boolean visit(AnnotationTypeDeclaration node) {
        return true;
    }

    /** Enum declarations contribute no tokens; children are still visited. */
    @Override
    public boolean visit(EnumDeclaration node) {
        return true;
    }

    /** Annotations contribute no tokens; children are still visited. */
    @Override
    public boolean visit(NormalAnnotation node) {
        return true;
    }
}