Skip to content

Commit

Permalink
Merge branch 'wr-scan-sas'
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxim Moinat committed Nov 18, 2019
2 parents 89dee6b + b1b1c73 commit 18e6a3c
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 63 deletions.
5 changes: 5 additions & 0 deletions rabbit-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,10 @@
<artifactId>avro</artifactId>
<version>1.8.2</version>
</dependency>
<dependency>
<groupId>com.epam</groupId>
<artifactId>parso</artifactId>
<version>2.0</version>
</dependency>
</dependencies>
</project>
14 changes: 7 additions & 7 deletions rabbit-core/src/main/java/org/ohdsi/utilities/ScanFieldName.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ public interface ScanFieldName {
String FRACTION_UNIQUE = "Fraction unique";
String SOURCE_DESCRIPTION = "Description";
String SOURCE_ALIAS = "Alias";
// String AVERAGE = "Average";
// String STDEV = "Standard Deviation";
// String MIN = "Min";
// String Q1 = "25%";
// String Q2 = "Median";
// String Q3 = "75%";
// String MAX = "Max%";
// Column headers for the numeric summary statistics written to the scan report overview.
String AVERAGE = "Average";
String STDEV = "Standard Deviation";
String MIN = "Min";
String Q1 = "25%";
String Q2 = "Median";
String Q3 = "75%";
// The maximum is an absolute value, not a percentile, so its header carries no '%' (matches MIN).
String MAX = "Max";
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
public class DbSettings {
// Discriminator values assigned to dataType. Declared final so they cannot be reassigned at runtime.
public static final int DATABASE = 1;
public static final int CSVFILES = 2;
public static final int SASFILES = 3;

public int dataType;
public List<String> tables = new ArrayList<String>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ public class WhiteRabbitMain implements ActionListener {
private JList<String> tableList;
private Vector<String> tables = new Vector<String>();
private boolean sourceIsFiles = true;
private boolean sourceIsSas = false;
private boolean targetIsFiles = false;

private List<JComponent> componentsToDisableWhenRunning = new ArrayList<JComponent>();
Expand Down Expand Up @@ -148,6 +149,7 @@ public void windowClosing(WindowEvent e) {
}

private void launchCommandLine(String iniFileName) {
// TODO: add option to scan sas7bdat from command line, using ini file
IniFile iniFile = new IniFile(iniFileName);
DbSettings dbSettings = new DbSettings();
if (iniFile.get("DATA_TYPE").equalsIgnoreCase("Delimited text files")) {
Expand Down Expand Up @@ -266,47 +268,43 @@ public void actionPerformed(ActionEvent e) {
sourcePanel.setLayout(new GridLayout(0, 2));
sourcePanel.setBorder(BorderFactory.createTitledBorder("Source data location"));
sourcePanel.add(new JLabel("Data type"));
sourceType = new JComboBox<String>(new String[] { "Delimited text files", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "PDW", "Redshift", "Teradata", "BigQuery" });
sourceType = new JComboBox<>(new String[] { "Delimited text files", "SAS7bdat", "MySQL", "Oracle", "SQL Server", "PostgreSQL", "MS Access", "PDW", "Redshift", "Teradata", "BigQuery" });
sourceType.setToolTipText("Select the type of source data available");
sourceType.addItemListener(new ItemListener() {

@Override
public void itemStateChanged(ItemEvent arg0) {
sourceIsFiles = arg0.getItem().toString().equals("Delimited text files");
sourceServerField.setEnabled(!sourceIsFiles);
sourceUserField.setEnabled(!sourceIsFiles);
sourcePasswordField.setEnabled(!sourceIsFiles);
sourceDatabaseField.setEnabled(!sourceIsFiles);
sourceDelimiterField.setEnabled(sourceIsFiles);
addAllButton.setEnabled(!sourceIsFiles);

if (!sourceIsFiles && arg0.getItem().toString().equals("Oracle")) {
sourceServerField
.setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '<host>/<sid>', '<host>:<port>/<sid>', '<host>/<service name>', or '<host>:<port>/<service name>'");
sourceUserField.setToolTipText("For Oracle servers this field contains the name of the user used to log in");
sourcePasswordField.setToolTipText("For Oracle servers this field contains the password corresponding to the user");
sourceDatabaseField
.setToolTipText("For Oracle servers this field contains the schema (i.e. 'user' in Oracle terms) containing the source tables");
} else if (!sourceIsFiles && arg0.getItem().toString().equals("PostgreSQL")) {
sourceServerField.setToolTipText("For PostgreSQL servers this field contains the host name and database name (<host>/<database>)");
sourceType.addItemListener(itemEvent -> {
sourceIsFiles = itemEvent.getItem().toString().equals("Delimited text files");
sourceIsSas = itemEvent.getItem().toString().equals("SAS7bdat");
boolean sourceIsDatabase = !(sourceIsFiles || sourceIsSas);
sourceServerField.setEnabled(sourceIsDatabase);
sourceUserField.setEnabled(sourceIsDatabase);
sourcePasswordField.setEnabled(sourceIsDatabase);
sourceDatabaseField.setEnabled(sourceIsDatabase);
sourceDelimiterField.setEnabled(sourceIsFiles);
addAllButton.setEnabled(sourceIsDatabase);

if (sourceIsDatabase && itemEvent.getItem().toString().equals("Oracle")) {
sourceServerField.setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '<host>/<sid>', '<host>:<port>/<sid>', '<host>/<service name>', or '<host>:<port>/<service name>'");
sourceUserField.setToolTipText("For Oracle servers this field contains the name of the user used to log in");
sourcePasswordField.setToolTipText("For Oracle servers this field contains the password corresponding to the user");
sourceDatabaseField.setToolTipText("For Oracle servers this field contains the schema (i.e. 'user' in Oracle terms) containing the source tables");
} else if (sourceIsDatabase && itemEvent.getItem().toString().equals("PostgreSQL")) {
sourceServerField.setToolTipText("For PostgreSQL servers this field contains the host name and database name (<host>/<database>)");
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("For PostgreSQL servers this field contains the schema containing the source tables");
} else if (sourceIsDatabase && itemEvent.getItem().toString().equals("BigQuery")) {
sourceServerField.setToolTipText("GBQ SA & UA: ProjectID");
sourceUserField.setToolTipText("GBQ SA only: OAuthServiceAccountEMAIL");
sourcePasswordField.setToolTipText("GBQ SA only: OAuthPvtKeyPath");
sourceDatabaseField.setToolTipText("GBQ SA & UA: Data Set within ProjectID");
} else if (sourceIsDatabase) {
sourceServerField.setToolTipText("This field contains the name or IP address of the database server");
if (itemEvent.getItem().toString().equals("SQL Server")) {
sourceUserField.setToolTipText("The user used to log in to the server. Optionally, the domain can be specified as <domain>/<user> (e.g. 'MyDomain/Joe')");
} else {
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("For PostgreSQL servers this field contains the schema containing the source tables");
} else if (!sourceIsFiles && arg0.getItem().toString().equals("BigQuery")) {
sourceServerField.setToolTipText("GBQ SA & UA: ProjectID");
sourceUserField.setToolTipText("GBQ SA only: OAuthServiceAccountEMAIL");
sourcePasswordField.setToolTipText("GBQ SA only: OAuthPvtKeyPath");
sourceDatabaseField.setToolTipText("GBQ SA & UA: Data Set within ProjectID");
} else if (!sourceIsFiles) {
sourceServerField.setToolTipText("This field contains the name or IP address of the database server");
if (arg0.getItem().toString().equals("SQL Server"))
sourceUserField
.setToolTipText("The user used to log in to the server. Optionally, the domain can be specified as <domain>/<user> (e.g. 'MyDomain/Joe')");
else
sourceUserField.setToolTipText("The user used to log in to the server");
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("The name of the database containing the source tables");
}
sourcePasswordField.setToolTipText("The password used to log in to the server");
sourceDatabaseField.setToolTipText("The name of the database containing the source tables");
}
});
sourcePanel.add(sourceType);
Expand Down Expand Up @@ -469,6 +467,7 @@ public void actionPerformed(ActionEvent e) {
}

private JPanel createFakeDataPanel() {
// TODO: add sas7bdat as target for fake data.
JPanel panel = new JPanel();

panel.setLayout(new GridBagLayout());
Expand Down Expand Up @@ -694,12 +693,16 @@ private void addAllTables() {
private void pickTables() {
DbSettings sourceDbSettings = getSourceDbSettings();
if (sourceDbSettings != null) {
if (sourceDbSettings.dataType == DbSettings.CSVFILES) {
if (sourceDbSettings.dataType == DbSettings.CSVFILES || sourceDbSettings.dataType == DbSettings.SASFILES) {
JFileChooser fileChooser = new JFileChooser(new File(folderField.getText()));
fileChooser.setMultiSelectionEnabled(true);
fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY);
FileNameExtensionFilter filter = new FileNameExtensionFilter("Delimited text files", "csv", "txt");
fileChooser.setFileFilter(filter);

if (sourceDbSettings.dataType == DbSettings.CSVFILES) {
fileChooser.setFileFilter(new FileNameExtensionFilter("Delimited text files", "csv", "txt"));
} else if (sourceDbSettings.dataType == DbSettings.SASFILES) {
fileChooser.setFileFilter(new FileNameExtensionFilter("SAS Data Files", "sas7bdat"));
}

int returnVal = fileChooser.showDialog(frame, "Select tables");
if (returnVal == JFileChooser.APPROVE_OPTION) {
Expand Down Expand Up @@ -746,6 +749,8 @@ private DbSettings getSourceDbSettings() {
dbSettings.delimiter = '\t';
else
dbSettings.delimiter = sourceDelimiterField.getText().charAt(0);
} else if (sourceType.getSelectedItem().equals("SAS7bdat")) {
dbSettings.dataType = DbSettings.SASFILES;
} else {
dbSettings.dataType = DbSettings.DATABASE;
dbSettings.user = sourceUserField.getText();
Expand Down Expand Up @@ -789,7 +794,7 @@ else if (sourceType.getSelectedItem().toString().equals("Teradata"))
}

private void testConnection(DbSettings dbSettings) {
if (dbSettings.dataType == DbSettings.CSVFILES) {
if (dbSettings.dataType == DbSettings.CSVFILES || dbSettings.dataType == DbSettings.SASFILES) {
if (new File(folderField.getText()).exists()) {
String message = "Folder " + folderField.getText() + " found";
JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Working folder found", JOptionPane.INFORMATION_MESSAGE);
Expand Down Expand Up @@ -902,7 +907,7 @@ else if (sourceType.getSelectedItem().toString().equals("SQL Server")) {

private void scanRun() {
if (tables.size() == 0) {
if (sourceIsFiles) {
if (sourceIsFiles || sourceIsSas) {
String message = "No files selected for scanning";
JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "No files selected", JOptionPane.ERROR_MESSAGE);
return;
Expand Down Expand Up @@ -971,7 +976,7 @@ public void run() {
DbSettings dbSettings = getSourceDbSettings();
if (dbSettings != null) {
for (String table : tables) {
if (dbSettings.dataType == DbSettings.CSVFILES)
if (dbSettings.dataType == DbSettings.CSVFILES || dbSettings.dataType == DbSettings.SASFILES)
table = folderField.getText() + "/" + table;
dbSettings.tables.add(table);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package org.ohdsi.whiteRabbit.scan;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.sql.ResultSet;
Expand All @@ -31,6 +32,10 @@
import java.util.function.Function;
import java.util.stream.Collectors;

import com.epam.parso.Column;
import com.epam.parso.SasFileProperties;
import com.epam.parso.SasFileReader;
import com.epam.parso.impl.SasFileReaderImpl;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
Expand Down Expand Up @@ -73,16 +78,20 @@ public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, i
if (!scanValues)
this.minCellCount = Math.max(minCellCount, MIN_CELL_COUNT_FOR_CSV);
tableToFieldInfos = processCsvFiles(dbSettings);
} else
} else if (dbSettings.dataType == DbSettings.SASFILES) {
tableToFieldInfos = processSasFiles(dbSettings);
} else {
tableToFieldInfos = processDatabase(dbSettings);
}
generateReport(tableToFieldInfos, filename);
}

private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {
// GBQ requires database. Put database value into domain var
if (dbSettings.dbType == DbType.BIGQUERY) {
// GBQ requires database. Put database value into domain var
dbSettings.domain = dbSettings.database;
};

try (RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType)) {
connection.setVerbose(false);
connection.use(dbSettings.database);
Expand All @@ -92,13 +101,12 @@ private Map<String, List<FieldInfo>> processDatabase(DbSettings dbSettings) {

return dbSettings.tables.stream()
.collect(Collectors.toMap(Function.identity(), table -> processDatabaseTable(table, connection)));

}
}

private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
delimiter = dbSettings.delimiter;
Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<String, List<FieldInfo>>();
Map<String, List<FieldInfo>> tableToFieldInfos = new HashMap<>();
for (String table : dbSettings.tables) {
List<FieldInfo> fieldInfos = processCsvFile(table);
String tableName = new File(table).getName();
Expand All @@ -107,7 +115,20 @@ private Map<String, List<FieldInfo>> processCsvFiles(DbSettings dbSettings) {
} else {
tableToFieldInfos.put(table, fieldInfos);
}
}
return tableToFieldInfos;
}

/**
 * Scans every SAS file listed in the settings and gathers the resulting field infos per table.
 * A table is keyed by its bare file name; if that name is already taken, the entry is stored
 * under the full path as listed in {@code dbSettings.tables} instead.
 *
 * @param dbSettings settings whose {@code tables} list holds the paths of the files to scan
 * @return map from table key to the field infos produced for that file
 */
private Map<String, List<FieldInfo>> processSasFiles(DbSettings dbSettings) {
	Map<String, List<FieldInfo>> fieldInfosPerTable = new HashMap<>();
	for (String sasPath : dbSettings.tables) {
		List<FieldInfo> scanned = processSasFile(sasPath);
		String baseName = new File(sasPath).getName();
		// Prefer the short name; fall back to the full path when the short name is already in use
		String key = fieldInfosPerTable.containsKey(baseName) ? sasPath : baseName;
		fieldInfosPerTable.put(key, scanned);
	}
	return fieldInfosPerTable;
}
Expand All @@ -128,15 +149,17 @@ private void generateReport(Map<String, List<FieldInfo>> tableToFieldInfos, Stri
addRow(overviewSheet, ScanFieldName.TABLE, ScanFieldName.FIELD, ScanFieldName.TYPE, ScanFieldName.N_ROWS);
for (String table : tables) {
for (FieldInfo fieldInfo : tableToFieldInfos.get(table)) {
addRow(overviewSheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount));
}
addRow(overviewSheet, table, fieldInfo.name, fieldInfo.getTypeDescription(), Long.valueOf(fieldInfo.rowCount));
}
addRow(overviewSheet, "");
}
} else {
addRow(overviewSheet,
ScanFieldName.TABLE, ScanFieldName.FIELD, ScanFieldName.TYPE, ScanFieldName.MAX_LENGTH,
ScanFieldName.N_ROWS, ScanFieldName.N_ROWS_CHECKED, ScanFieldName.FRACTION_EMPTY,
ScanFieldName.UNIQUE_COUNT, ScanFieldName.FRACTION_UNIQUE
ScanFieldName.UNIQUE_COUNT, ScanFieldName.FRACTION_UNIQUE,
ScanFieldName.AVERAGE, ScanFieldName.STDEV,
ScanFieldName.MIN, ScanFieldName.Q1, ScanFieldName.Q2, ScanFieldName.Q3, ScanFieldName.MAX
);
int sheetIndex = 0;
Map<String, String> sheetNameLookup = new HashMap<>();
Expand All @@ -150,17 +173,17 @@ private void generateReport(Map<String, List<FieldInfo>> tableToFieldInfos, Stri
for (FieldInfo fieldInfo : tableToFieldInfos.get(tableName)) {
Long uniqueCount = fieldInfo.uniqueCount;
Double fractionUnique = fieldInfo.getFractionUnique();
addRow(overviewSheet, tableNameIndexed, fieldInfo.name, fieldInfo.getTypeDescription(),
addRow(overviewSheet, tableNameIndexed, fieldInfo.name, fieldInfo.getTypeDescription(),
Integer.valueOf(fieldInfo.maxLength),
Long.valueOf(fieldInfo.rowCount),
Long.valueOf(fieldInfo.nProcessed),
Long.valueOf(fieldInfo.nProcessed),
fieldInfo.getFractionEmpty(),
fieldInfo.hasValuesTrimmed() ? String.format("<= %d", uniqueCount) : uniqueCount,
fieldInfo.hasValuesTrimmed() ? String.format("<= %.3f", fractionUnique) : fractionUnique,
fieldInfo.mean, fieldInfo.stdev, fieldInfo.min, fieldInfo.q1, fieldInfo.q2, fieldInfo.q3, fieldInfo.max
);
this.setCellStyles(overviewSheet, percentageStyle, 6, 8);
}
}
addRow(overviewSheet, "");
sheetIndex += 1;
}
Expand Down Expand Up @@ -353,7 +376,7 @@ else if (dbType == DbType.BIGQUERY) {

private List<FieldInfo> processCsvFile(String filename) {
StringUtilities.outputWithTime("Scanning table " + filename);
List<FieldInfo> fieldInfos = new ArrayList<FieldInfo>();
List<FieldInfo> fieldInfos = new ArrayList<>();
int lineNr = 0;
for (String line : new ReadTextFile(filename)) {
lineNr++;
Expand Down Expand Up @@ -383,6 +406,46 @@ private List<FieldInfo> processCsvFile(String filename) {
return fieldInfos;
}

/**
 * Scans a single sas7bdat file: creates one FieldInfo per column and feeds every scanned
 * row value (as a string, null becoming the empty string) into the matching FieldInfo.
 * At most {@code sampleSize} rows are scanned; a {@code sampleSize} of -1 scans all rows.
 * An I/O failure is reported and whatever was collected so far is still wrapped up and returned.
 *
 * @param filename path of the sas7bdat file to scan
 * @return the field infos of the file, in column order (empty if the file could not be opened)
 */
private List<FieldInfo> processSasFile(String filename) {
	StringUtilities.outputWithTime("Scanning table " + filename);
	List<FieldInfo> fieldInfos = new ArrayList<>();

	// try-with-resources closes the stream even when reading fails part-way through the file
	try (FileInputStream inputStream = new FileInputStream(new File(filename))) {
		SasFileReader sasFileReader = new SasFileReaderImpl(inputStream);

		// TODO: retrieve more information from the sasFileProperties, like data type and length.
		SasFileProperties sasFileProperties = sasFileReader.getSasFileProperties();
		for (Column column : sasFileReader.getColumns()) {
			fieldInfos.add(new FieldInfo(column.getName()));
		}

		long rowCount = sasFileProperties.getRowCount();
		for (long i = 0; i < rowCount; i++) {
			// Check the limit before reading, so exactly sampleSize rows are scanned (not sampleSize + 1)
			if (sampleSize >= 0 && i >= sampleSize) {
				break;
			}

			Object[] row = sasFileReader.readNext();
			if (row == null) {
				// Fewer rows available than the header announced (e.g. truncated file)
				break;
			}

			if (row.length == fieldInfos.size()) { // Else there appears to be a formatting error, so skip
				for (int j = 0; j < row.length; j++) {
					fieldInfos.get(j).processValue(row[j] == null ? "" : row[j].toString());
				}
			}
		}
	} catch (IOException e) {
		StringUtilities.outputWithTime("WARNING: error while scanning " + filename + ": " + e.getMessage());
		e.printStackTrace();
	}

	for (FieldInfo fieldInfo : fieldInfos) {
		fieldInfo.wrapUp();
	}

	return fieldInfos;
}

private class FieldInfo {
public String type;
public String name;
Expand Down

0 comments on commit 18e6a3c

Please sign in to comment.