Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add processing for ICLR-GREAT Paper and Fix bugs #8

Merged
merged 14 commits into from
Aug 25, 2020
Merged
34 changes: 34 additions & 0 deletions fixture/representation/Enum.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Code from Apache
*/
package org.apache.cassandra.gms;

/**
 * Fixture enum (lives under fixture/representation) made of constants only:
 * it declares no fields, constructors or methods.
 * <p>
 * The first fifteen constants are named states; {@code X1} through {@code X10}
 * are placeholder entries kept as padding (see the inline comments).
 * <p>
 * NOTE(review): declaration order defines each constant's ordinal; presumably
 * the code this was copied from relies on that order, so confirm before
 * reordering or inserting entries.
 */
public enum Enum
{
STATUS,
LOAD,
SCHEMA,
DC,
RACK,
RELEASE_VERSION,
REMOVAL_COORDINATOR,
INTERNAL_IP,
RPC_ADDRESS,
X_11_PADDING, // padding specifically for 1.1
SEVERITY,
NET_VERSION,
HOST_ID,
TOKENS,
RPC_READY,
// pad to allow adding new states to existing cluster
X1,
X2,
X3,
X4,
X5,
X6,
X7,
X8,
X9,
X10,
}
32 changes: 32 additions & 0 deletions fixture/representation/UnparsableCode.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
 * Fixture that is intentionally invalid: the static field below is declared
 * with 256 array dimensions (25 rows of 10 bracket pairs plus a final row of
 * 6), which the inline note marks as too many.
 *
 * NOTE(review): presumably used to exercise the "couldn't parse code" path of
 * the graph builder - confirm against the tests. Do not "fix" or reformat the
 * declaration; the exact bracket count is the point of the fixture.
 */
class UnparsableCode {
static Object
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][][][][][]
[][][][][][] o; // 256 = too many

// Empty entry point: the class has no behavior beyond the oversized declaration above.
public static void main(String[] args) {
}
}
7 changes: 7 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@
<version>3.17.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>

</dependencies>

<build>
Expand Down
104 changes: 104 additions & 0 deletions src/main/java/com/github/sergdelft/j2graph/DataGenerator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package com.github.sergdelft.j2graph;
mauricioaniche marked this conversation as resolved.
Show resolved Hide resolved

import com.github.sergdelft.j2graph.ast.JDT;
import com.github.sergdelft.j2graph.graph.*;
import com.github.sergdelft.j2graph.walker.GraphWalker;
import com.github.sergdelft.j2graph.walker.json.JsonVisitor;
import com.google.gson.JsonObject;
import org.apache.commons.lang3.tuple.Pair;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;
import java.util.stream.Stream;


/**
 * Generator for ICLR20-Great data format: https://github.com/VHellendoorn/ICLR20-Great
 * <p>
 * Also generates token vocabulary for training data (no BPE version).
 * <p>
 * For every split two files are created in the working directory:
 * {@code <split>.txt} with one JSON object per line (correct sample followed by
 * its buggy counterpart) and {@code <split>_vocab.txt}, which is only filled for
 * the training split.
 */
public class DataGenerator {

    /** The data splits; the lower-cased constant name is used as the output file prefix. */
    enum Split {
        TRAIN,
        DEV,
        EVAL
    }

    /**
     * Runs the generator on the placeholder directories. Edit the paths below, or
     * call {@link #run(String, String, String)} with real locations.
     */
    public void run() {
        run("path/to/folder/containing/java/files/for/train",
                "path/to/folder/containing/java/files/for/train/validation",
                "path/to/folder/containing/java/files/for/test");
    }

    /**
     * Processes the three splits in order: train, dev, eval.
     * <p>
     * An {@link IOException} raised while handling one split is printed and the
     * remaining splits are skipped.
     *
     * @param trainDir folder with the Java files of the training split
     * @param devDir   folder with the Java files of the validation split
     * @param evalDir  folder with the Java files of the test split
     */
    public void run(String trainDir, String devDir, String evalDir) {
        try {
            iterateFiles(trainDir, Split.TRAIN);
            iterateFiles(devDir, Split.DEV);
            iterateFiles(evalDir, Split.EVAL);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Walks {@code path} recursively and feeds every regular file to
     * {@link #processFile}.
     *
     * @param path  root folder to scan
     * @param split split the folder belongs to; determines the output file names
     * @throws IOException if an output file cannot be opened or the folder cannot be walked
     */
    private void iterateFiles(String path, Split split) throws IOException {
        // Locale.ROOT keeps the file prefix stable regardless of the default locale
        // (e.g. a Turkish locale would otherwise turn "TRAIN" into "tra\u0131n").
        String prefix = split.name().toLowerCase(Locale.ROOT);
        System.out.println("Starting to preprocess files for " + prefix);

        // try-with-resources closes both writers (and their underlying streams) as well
        // as the directory stream returned by Files.walk, even when processing fails.
        try (PrintWriter processedDataWriter = openWriter(prefix + ".txt");
             PrintWriter vocabWriter = openWriter(prefix + "_vocab.txt");
             Stream<Path> files = Files.walk(Paths.get(path))) {
            files.filter(Files::isRegularFile)
                    .forEach(filePath -> processFile(split, processedDataWriter, vocabWriter, filePath));
        }
    }

    /**
     * Opens an auto-flushing, buffered UTF-8 writer on {@code fileName}, truncating
     * any existing file.
     */
    private PrintWriter openWriter(String fileName) throws IOException {
        return new PrintWriter(
                new BufferedOutputStream(new FileOutputStream(fileName)), true, StandardCharsets.UTF_8);
    }

    /**
     * Parses one source file and writes its correct/buggy JSON pairs, one object per
     * line. For the training split the tokens are also appended to the vocabulary
     * file, provided the file produced at least one pair.
     * <p>
     * Files for which the parser throws {@link IllegalArgumentException} are
     * reported on standard output and skipped.
     */
    private void processFile(Split split, PrintWriter processedDataWriter, PrintWriter vocabWriter, Path filePath) {
        GraphWalker graphWalker = new GraphWalker();
        String sourceCode = loadSourceCode(filePath.toString());
        try {
            ClassGraph graph = new JDT().parse(sourceCode);
            if (graph != null) {
                JsonVisitor jsonVisitor = new JsonVisitor();
                graphWalker.accept(graph, jsonVisitor);
                if (split == Split.TRAIN && !jsonVisitor.getCorrectAndBuggyPairs().isEmpty()) {
                    saveTokensToFile(vocabWriter, graph);
                }
                for (Pair<JsonObject, JsonObject> pair : jsonVisitor.getCorrectAndBuggyPairs()) {
                    processedDataWriter.println(pair.getLeft());
                    processedDataWriter.println(pair.getRight());
                    processedDataWriter.flush();
                }
            }
        } catch (IllegalArgumentException e) {
            System.out.println("Couldn't parse code. Ignoring and continuing...");
        }
    }

    /**
     * Reads the whole file as UTF-8 text, matching the encoding of the generated output.
     *
     * @param fixture path of the file to read
     * @return the file content
     * @throws UncheckedIOException if the file cannot be read
     */
    protected String loadSourceCode(String fixture) {
        try {
            return new String(Files.readAllBytes(Paths.get(fixture)), StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new UncheckedIOException("Could not read source file " + fixture, e);
        }
    }

    /**
     * Appends, for every method of {@code graph}, four space-separated lines to the
     * vocabulary file: token names, symbols, vocabulary words and non-terminal names.
     * Each line is preceded by a line break, so a method's output starts with an
     * empty line terminator and its last line is left unterminated.
     */
    private void saveTokensToFile(PrintWriter vocabWriter, ClassGraph graph) {
        for (MethodGraph methodGraph : graph.getMethods()) {
            vocabWriter.println("");
            methodGraph.getTokens().forEach(t -> vocabWriter.print(t.getTokenName() + " "));
            vocabWriter.println("");
            methodGraph.getSymbols().forEach(s -> vocabWriter.print(s.getSymbol() + " "));
            vocabWriter.println("");
            methodGraph.getVocabulary().forEach(v -> vocabWriter.print(v.getWord() + " "));
            vocabWriter.println("");
            methodGraph.getNonTerminals().forEach(nt -> vocabWriter.print(nt.getName() + " "));
            // print() does not trigger auto-flush, so flush explicitly after each method.
            vocabWriter.flush();
        }
    }
}
9 changes: 9 additions & 0 deletions src/main/java/com/github/sergdelft/j2graph/Main.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package com.github.sergdelft.j2graph;

public class Main {

public static void main(String[] args) {
DataGenerator dataGenerator = new DataGenerator();
dataGenerator.run();
}
}
Loading