From 229c91c073a7d7f1591ee3c958bc7347ab9daa33 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 26 Apr 2022 13:41:37 +0000 Subject: [PATCH 01/37] porting some of the for loops to hdfs --- .../dependency_untangling/compress_files.sh | 21 +++ .../dependency_untangling/encrypt_files.sh | 20 +++ .../dependency_untangling/genomics.sh | 42 +++++ .../dependency_untangling/img_convert.sh | 12 ++ .../input/install-deps.sh | 34 ++++ .../dependency_untangling/input/packages | 150 ++++++++++++++++ .../dependency_untangling/input/setup.sh | 160 ++++++++++++++++++ .../dependency_untangling/nginx.sh | 36 ++++ .../dependency_untangling/pacaur.sh | 42 +++++ .../dependency_untangling/pcap.sh | 25 +++ .../dependency_untangling/proginf.sh | 18 ++ .../dependency_untangling/run.distr.sh | 90 ++++++++++ .../dependency_untangling/to_mp3.sh | 21 +++ 13 files changed, 671 insertions(+) create mode 100755 evaluation/distr_benchmarks/dependency_untangling/compress_files.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/genomics.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/img_convert.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh create mode 100644 evaluation/distr_benchmarks/dependency_untangling/input/packages create mode 100755 evaluation/distr_benchmarks/dependency_untangling/input/setup.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/nginx.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/pacaur.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/pcap.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/proginf.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/run.distr.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh diff --git 
a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh new file mode 100755 index 000000000..652ce1969 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# compress all files in a directory +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/compress} +LOGS=${OUT}/logs +mkdir -p ${OUT}/logs +run_tests() { + name=$(basename $1).zip + zip -r ${OUT}/$name $1 +} + +export -f run_tests + +pkg_count=0 +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > "${LOGS}"/"$pkg_count.log" +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh new file mode 100755 index 000000000..421732513 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# encrypt all files in a directory +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/encrypt} +LOGS=${OUT}/logs +mkdir -p ${LOGS} +run_tests() { + openssl enc -aes-256-cbc -pbkdf2 -iter 20000 -in $1 -out $OUT/$(basename $1).enc -k 'key' +} + +export -f run_tests +pkg_count=0 + +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > ${LOGS}/${pkg_count}.log +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/genomics.sh b/evaluation/distr_benchmarks/dependency_untangling/genomics.sh new file mode 100755 index 000000000..e5af9c9b9 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/genomics.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# create bam files with regions +################### 1KG 
SAMPLES +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input} +SAMTOOLS_BIN=${IN}/deps/samtools-1.7/samtools +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/bio} +LOGS=${OUT}/logs +IN_NAME=${IN}/bio/100G.txt +GENE_LOCS=${IN}/bio/Gene_locs.txt +mkdir -p ${LOGS} +run_tests() { + s_line=$(echo $1 | tr '@' ' ') + pop=$(echo $s_line |cut -f 1 -d " "); + sample=$(echo $s_line |cut -d " " -f 2); + link=$(echo $s_line |cut -f 3 -d " "); + ### correcting labeling of chromosomes so that all are 1,2,3.. instead of chr1,chr2 or chromosome1 etc + echo 'Processing Sample '${IN}/bio/$sample' '; + # uniform the chromosomes in the file due to inconsistencies + $SAMTOOLS_BIN view -H "${IN}/bio/$sample".bam | sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' -e 's/SN:MT/SN:chrM/' \ + | $SAMTOOLS_BIN reheader - "${IN}/bio/$sample".bam > "${OUT}/$sample"_corrected.bam 2> /dev/null + # create bai file + $SAMTOOLS_BIN index -b "${OUT}/$sample"_corrected.bam 2> /dev/null + ### Isolating each relevant chromosome based on Gen_locs + cut -f 2 ${IN}/bio/Gene_locs.txt |sort |uniq |while read chr; + do + echo 'Isolating Chromosome '$chr' from sample '${OUT}/$sample', '; + $SAMTOOLS_BIN view -b "${OUT}/$sample"_corrected.bam chr"$chr" > "${OUT}/$pop"_"$sample"_"$chr".bam 2> /dev/null + echo 'Indexing Sample '$pop'_'${OUT}/$sample' '; + $SAMTOOLS_BIN index -b "${OUT}/$pop"_"$sample"_"$chr".bam 2> /dev/null + done; +} + +export -f run_tests +data=$(cat ${IN_NAME} | tr ' ' '@') +pkg_count=0 +for item in $data; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > "${LOGS}"/"${pkg_count}.log" +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh new file mode 100755 index 000000000..2b87d0528 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# tag: resize image 
+IN=${JPG:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/jpg} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/jpg} +mkdir -p ${OUT} +for i in $IN/*.jpg; +do + out=$OUT/$(basename -- $i) + convert -resize 70% "$i" "$out"; +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh new file mode 100755 index 000000000..4cb9e845a --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh @@ -0,0 +1,34 @@ +IN=$PASH_TOP/evaluation/benchmarks/dependency_untangling/input/ +mkdir -p ${IN}/deps/ +# install dependencies +pkgs='ffmpeg unrtf imagemagick libarchive-tools zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump' + +if ! dpkg -s $pkgs >/dev/null 2>&1 ; then + sudo apt-get install $pkgs -y + echo 'Packages Installed' +fi + +if [ ! -d ${IN}/deps/samtools-1.7 ]; then + cd ${IN}/deps/ + wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip + unzip 1.7.zip + rm 1.7.zip + cd samtools-1.7 + wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip + unzip 1.7.zip + autoheader # Build config.h.in (this may generate a warning about + # AC_CONFIG_SUBDIRS - please ignore it). + autoconf -Wno-syntax # Generate the configure script + ./configure # Needed for choosing optional functionality + make + rm -rf 1.7.zip + echo 'Samtools installed' +fi + +if [ ! 
-f ${IN}/deps/makedeb.deb ]; then + cd ${IN}/deps/ + wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb + sudo dpkg -i makedeb.deb + echo 'Makedeb installed' +fi + diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/packages b/evaluation/distr_benchmarks/dependency_untangling/input/packages new file mode 100644 index 000000000..3d2fb08a6 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/input/packages @@ -0,0 +1,150 @@ +w3watch +cant +jzip +zork1 +zork2 +zork3 +atari-adventure +eclipse-subclipse +wallpaper-lightning +squirrelmail +atari-bowling +atari-breakout +atari-combat +atari-space-invaders +cdm-git +gtk-gnutella +roundcubemail-plugin-chbox +roundcubemail-plugin-jquery-mobile +roundcubemail-plugin-mobile +eclipse-svnkit +eclipse-dltk-core +eclipse-dltk-javascript +eclipse-antlr-runtime +eclipse-dltk-shelled +eclipse-linuxtools +eclipse-dltk-python +eclipse-antlr4-runtime +eclipse-jsonedit +eclipse-goclipse +adwaita-dark-darose +roundcubemail-plugin-keyboard-shortcuts-ng +refind-theme-tux-git +refind-theme-metro-git +ggmud-svn +libiriverdb +griver +fsv2 +vecx-git +lib32-glib +lib32-gtk +qjoypad +yumbootstrap-git +nesasm-git +yum-metadata-parser +libretro-fmsx-git +projectm-git +papu-vst-git +rp2a03-vst-git +sn76489-vst-git +ggmud +bpm-git +kodi-addon-vfs-rar +eduke32-git +voidsw-git +kodi-addon-vfs-rar-git +rottexpr-shareware-git +bubblemon +gno3dtet +tutka +netpanzer +stratagus +scourge +lives +drqueue +cytadela +bitefusion +globs-svn +ri-li +globs-benchmarks-svn +dunelegacy +eternallands-sound +getlive +shadermaker +csl +brother-dcp350c +pacstats-hg +pacstats +tracy +eternallands-music +cal3d-svn +tracy-git +ncine-git +ncpong-git +ncparticleeditor-git +ncinvaders-git +ncine +ncline-git +eternallands +nctracer-git +nctiledviewer-git +spookyghost-git +postgresqltuner +luniistore +unscd +netatop +scamper +graylog +bitcoinxt +prysm +python-pysword +python2-pysword +mp3rename +plc +python3-sensors-git +python2-bencode 
+python2-binplist +python2-dfvfs +python2-dfwinreg +python2-artifacts +python2-pytsk3 +python2-libbde +python2-libesedb +python2-libevt +python2-libevtx +python2-libfsntfs +python2-libfwsi +python2-liblnk +python2-libmsiecf +python2-libolecf +python2-libqcow +python2-libregf +python2-libscca +python2-libsigscan +python2-libsmdev +python2-libsmraw +python2-libvhdi +python2-libvmdk +python2-libvshadow +python2-libewf +python2-dfdatetime +libvhdi +python2-acora +python2-efilter +audioconvert +gogglesmm-git +gogglesmm-develop-git +hangupsbot +python-quamash-git +python-reparser +pwgen-passphrase +macchiato-git +ft232r_prog +gr-dsd-git +rtl_power_fftw-git +python2-pyrtlsdr-git +python-pyrtlsdr-git +csdr-git +gr-dab-git +rtlsdr-scanner-git +shinysdr-git diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh new file mode 100755 index 000000000..58ee4bd7d --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +# exit when any command fails +#set -e + +IN=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/ +OUT=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/output/ +IN_NAME=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/100G.txt + +if [ "$1" == "-c" ]; then + rm -rf ${IN}/jpg + rm -rf ${IN}/log_data + rm -rf ${IN}/wav + rm -rf ${IN}/nginx-logs + rm -rf ${IN}/node_modules + rm -rf ${IN}/pcap_data + rm -rf ${IN}/pcaps + rm -rf ${IN}/packages + rm -rf ${IN}/mir-sa + rm -rf ${IN}/deps + rm -rf ${IN}/bio + rm -rf ${IN}/output + rm -rf ${OUT} + exit +fi + +setup_dataset() { + if [ "$1" == "--small" ]; then + LOG_DATA_FILES=6 + WAV_DATA_FILES=20 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip + 
PCAP_DATA_FILES=1 + else + LOG_DATA_FILES=84 + WAV_DATA_FILES=120 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip + PCAP_DATA_FILES=15 + fi + + if [ ! -d ${IN}/wav ]; then + wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip + unzip wav.zip && cd wav/ + for f in *.wav; do + FILE=$(basename "$f") + for (( i = 0; i <= $WAV_DATA_FILES; i++)) do + echo copying to $f$i.wav + cp $f $f$i.wav + done + done + echo "WAV Generated" + fi + + if [ ! -d ${IN}/jpg ]; then + cd ${IN} + wget $JPG_DATA_LINK + unzip jpg.zip + echo "JPG Generated" + rm -rf ${IN}/jpg.zip + fi + + # download the input for the nginx logs and populate the dataset + if [ ! -d ${IN}/log_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip + unzip nginx.zip + rm nginx.zip + # generating analysis logs + mkdir -p ${IN}/log_data + for (( i = 1; i <=$LOG_DATA_FILES; i++)) do + for j in nginx-logs/*;do + n=$(basename $j) + cat $j > log_data/log${i}_${n}.log; + done + done + echo "Logs Generated" + fi + + if [ ! -d ${IN}/bio ]; then + if [ "$1" = "--small" ]; then + # download the Genome loc file + wget $BIO_DATA_LINK + unzip bio.zip + cd bio + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt + cd .. + rm bio.zip + else + mkdir ${IN}/bio + cd ${IN}/bio + # download the file containing the links for the dataset + wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt + # download the Genome loc file + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # start downloading the real dataset + cat ${IN_NAME} |while read s_line; + do + echo ${IN_NAME} + sample=$(echo $s_line |cut -d " " -f 2); + if [[ ! 
-f $sample ]]; then + pop=$(echo $s_line |cut -f 1 -d " "); + link=$(echo $s_line |cut -f 3 -d " "); + wget -O "$sample".bam "$link"; ##this part can be adjusted maybe + fi + done; + fi + echo "Genome data downloaded" + fi + + # download the initial pcaps to populate the whole dataset + if [ ! -d ${IN}/pcap_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip + unzip pcaps.zip + rm pcaps.zip + mkdir ${IN}/pcap_data/ + # generates 20G + for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do + for j in ${IN}/pcaps/*;do + n=$(basename $j) + cat $j > pcap_data/pcap${i}_${n}; + done + done + echo "Pcaps Generated" + fi + + # download the modules for the Mir static analyses + if [ ! -d ${IN}/node_modules ]; then + cd $IN + wget $NODE_MODULE_LINK + unzip node_modules.zip + rm node_modules.zip + # download the specific mir version + wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip + unzip mir-sa.zip + rm mir-sa.zip + echo "Node modules generated" + fi + + # download the packages for the package building + if [ ! 
-f ${IN}/packages ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/packages + if [ "$1" = "--small" ]; then + head -n 20 packages > p + mv p packages + fi + echo "Package datset downloaded" + fi +} + +source_var() { + export IN= +} + +setup_dataset diff --git a/evaluation/distr_benchmarks/dependency_untangling/nginx.sh b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh new file mode 100755 index 000000000..afd53af8e --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# tag: nginx logs +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/log_data} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/nginx-logs} +mkdir -p ${OUT} + +run_tests() { + # i don't think we should assign things to $0, however, it works with both + IN=$1 + cat $IN | cut -d "\"" -f3 | cut -d ' ' -f2 | sort | uniq -c | sort -rn + # awk alternative, too slow + awk '{print $9}' $IN | sort | uniq -c | sort -rn + # find broken links broken links + awk '($9 ~ /404/)' $IN | awk '{print $7}' | sort | uniq -c | sort -rn + # for 502 (bad-gateway) we can run following command: + awk '($9 ~ /502/)' $IN | awk '{print $7}' | sort | uniq -c | sort -r + # Who are requesting broken links (or URLs resulting in 502) + awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' $IN | awk '{print $1}' | sort | uniq -c | sort -r + # 404 for php files -mostly hacking attempts + awk '($9 ~ /404/)' $IN | awk -F\" '($2 ~ "^GET .*.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20 + ############################## + # Most requested URLs ######## + awk -F\" '{print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r + # Most requested URLs containing XYZ + awk -F\" '($2 ~ "ref"){print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r +} + +export -f run_tests +for f in ${IN}/*; do + #bash -c 'run_tests $0 $1' $f $f #> /dev/null + #run_tests $f > /dev/null + 
logname=$OUT/$(basename $f) + run_tests $f > $logname +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/pacaur.sh b/evaluation/distr_benchmarks/dependency_untangling/pacaur.sh new file mode 100755 index 000000000..b8a76594c --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/pacaur.sh @@ -0,0 +1,42 @@ +#!/bin/bash +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/packages} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/packages} +LOGS=${OUT}/logs +mkdir -p ${OUT} ${LOGS} + +info() { echo -e "\e[1m--> $@\e[0m"; } +mkcd() { mkdir -p "$1" && cd "$1"; } + +# check if not running as root +# test "$UID" -gt 0 || { info "don't run this as root!"; exit; } + + + +run_tests() { + pgk=$1 + info "create subdirectory for $pkg" + mkcd "${OUT}/$pkg" + + # set link to plaintext PKGBUILDs + pkgbuild="https://aur.archlinux.org/cgit/aur.git/plain/PKGBUILD?h" + + info "fetch PKGBUILD for $pkg" + curl --insecure -o PKGBUILD "$pkgbuild=$pkg" 2> /dev/null|| echo ' ' + + #info "fetch required pgp keys from PKGBUILD" + #gpg --recv-keys $(sed -n "s:^validpgpkeys=('\([0-9A-Fa-fx]\+\)').*$:\1:p" PKGBUILD) + info "make and install ..." 
+ timeout 100 makedeb-makepkg --format-makedeb -d 2>/dev/null|| echo 'failed' + cd - +} + +export -f run_tests +pkg_count=0 +# loop over required packages +for pkg in $(cat ${IN} | tr '\n' ' ' ); +do + pkg_count=$((pkg_count + 1)) + run_tests $pkg > "${LOGS}"/"$pkg_count.log" +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh new file mode 100755 index 000000000..d4e1b70ea --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#tag: pcap analysis +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/pcap-analysis} +LOGS=${OUT}/logs +mkdir -p ${LOGS} +run_tests() { + INPUT=$1 + /usr/sbin/tcpdump -nn -r ${INPUT} -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null + # extract URL + /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null + # extract passwords + /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -A -n -l 2> /dev/null | egrep -i "POST /|pwd=|passwd=|password=|Host:" 2> /dev/null +} +export -f run_tests + +pkg_count=0 + +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > ${LOGS}/${pkg_count}.log +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/proginf.sh b/evaluation/distr_benchmarks/dependency_untangling/proginf.sh new file mode 100755 index 000000000..52f33fd04 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/proginf.sh @@ -0,0 +1,18 @@ +#!/bin/bash +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/node_modules} +MIR_BIN=${MIR_BIN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/mir-sa/.bin/mir-sa} 
+OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mir} +mkdir -p ${OUT}/ +pkg_count=0 +run_tests() { + cd $1; + ${MIR_BIN} -p 2>>${OUT}/error.log +} +export -f run_tests +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > ${OUT}/$pkg_count.log +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh new file mode 100755 index 000000000..8928ed6be --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh @@ -0,0 +1,90 @@ +PASH_FLAGS='--width 6 --r_split' +export TIMEFORMAT=%R +export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" + +names_scripts=( + "MediaConv1;img_convert" + "MediaConv2;to_mp3" + "Program_Inference;proginf" + "LogAnalysis1;nginx" + "LogAnalysis2;pcap" + # "Genomics_Computation;genomics" + "AurPkg;pacaur" + "FileEnc1;compress_files" + "FileEnc2;encrypt_files" + ) + +oneliners_bash() { + seq_times_file="seq.res" + seq_outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$seq_times_file" + cat $seq_times_file > $seq_times_file.d + echo executing one-liners $(date) | tee -a "$seq_times_file" + echo '' > "$seq_times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + export IN= + export OUT= + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) | tee -a "$seq_times_file" + done +} + +oneliners_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + 
pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat $times_file > $times_file.d + echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" + echo '' > "$times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + + export IN= + export OUT= + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done +} + +# oneliners_bash +oneliners_pash "$PASH_FLAGS" "par" +# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" diff --git a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh new file mode 100755 index 000000000..1f84bb277 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# tag: wav-to-mp3 +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/wav} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mp3} +LOGS=${OUT}/logs +mkdir -p ${LOGS} +trigrams_aux(){ + ffmpeg -y -i pipe:0 -f mp3 -ab 192000 pipe:1 2>/dev/null +} + +export -f trigrams_aux + +pkg_count=0 +for item in $(hdfs dfs -ls -C /for-loops/wav); +do + pkg_count=$((pkg_count + 1)); + out="$OUT/$(basename $item).mp3" + hdfs dfs -cat $item | trigrams_aux > $out +done + +echo 'done'; From a11812d6d4970ceb3bbd657c9a48818a736c5573 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: 
Tue, 26 Apr 2022 13:44:10 +0000 Subject: [PATCH 02/37] porting oneliners to use hdfs --- .../distr_benchmarks/oneliners/.gitignore | 1 + .../distr_benchmarks/oneliners/README.md | 18 ++++ .../distr_benchmarks/oneliners/bi-gram.aux.sh | 96 +++++++++++++++++++ .../distr_benchmarks/oneliners/bi-grams.sh | 15 +++ evaluation/distr_benchmarks/oneliners/diff.sh | 21 ++++ .../oneliners/input/.gitignore | 3 + .../distr_benchmarks/oneliners/input/setup.sh | 91 ++++++++++++++++++ .../distr_benchmarks/oneliners/nfa-regex.sh | 6 ++ .../distr_benchmarks/oneliners/run.distr.sh | 90 +++++++++++++++++ .../distr_benchmarks/oneliners/set-diff.sh | 20 ++++ .../oneliners/shortest-scripts.sh | 11 +++ .../distr_benchmarks/oneliners/sort-sort.sh | 6 ++ evaluation/distr_benchmarks/oneliners/sort.sh | 7 ++ .../distr_benchmarks/oneliners/spell.sh | 16 ++++ .../distr_benchmarks/oneliners/top-n.sh | 8 ++ evaluation/distr_benchmarks/oneliners/wf.sh | 6 ++ 16 files changed, 415 insertions(+) create mode 100644 evaluation/distr_benchmarks/oneliners/.gitignore create mode 100644 evaluation/distr_benchmarks/oneliners/README.md create mode 100755 evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh create mode 100755 evaluation/distr_benchmarks/oneliners/bi-grams.sh create mode 100755 evaluation/distr_benchmarks/oneliners/diff.sh create mode 100644 evaluation/distr_benchmarks/oneliners/input/.gitignore create mode 100755 evaluation/distr_benchmarks/oneliners/input/setup.sh create mode 100755 evaluation/distr_benchmarks/oneliners/nfa-regex.sh create mode 100755 evaluation/distr_benchmarks/oneliners/run.distr.sh create mode 100755 evaluation/distr_benchmarks/oneliners/set-diff.sh create mode 100755 evaluation/distr_benchmarks/oneliners/shortest-scripts.sh create mode 100755 evaluation/distr_benchmarks/oneliners/sort-sort.sh create mode 100755 evaluation/distr_benchmarks/oneliners/sort.sh create mode 100755 evaluation/distr_benchmarks/oneliners/spell.sh create mode 100755 
evaluation/distr_benchmarks/oneliners/top-n.sh create mode 100755 evaluation/distr_benchmarks/oneliners/wf.sh diff --git a/evaluation/distr_benchmarks/oneliners/.gitignore b/evaluation/distr_benchmarks/oneliners/.gitignore new file mode 100644 index 000000000..2211df63d --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/evaluation/distr_benchmarks/oneliners/README.md b/evaluation/distr_benchmarks/oneliners/README.md new file mode 100644 index 000000000..cf6a5381a --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/README.md @@ -0,0 +1,18 @@ +## Expert One-liners + +This directory contains ten scripts collected from several sources, including GitHub, Stack Overflow, and the Unix literature. +They are written by developers who are (or approximate) experts in Unix shell scripting, and include several Unix classics. + +1. `nfa-regex.sh` Match complex regular-expression over input +2. `sort.sh` Sort a text input +3. `top-n.sh` Find the top 1000 terms in a document +4. `wf.sh` Calculate the frequency of each word in the document, and sort by frequency +5. `spell.sh` Compute misspelled words in an input document +6. `bi-grams.sh` Find all 2-grams in a piece of text +7. `diff.sh` Compares two streams element by element +8. `set-diff.sh` Show the set-difference between two streams (i.e., elements in the first that are not in the second). +9. `shortest-scripts.sh` Find the shortest scripts +10. `sort-sort.sh` Calculate sort twice + +The `bi-gram.aux.sh` script contains helper functions for `bi-grams.sh`. +To generate inputs, run `./input/setup.sh`. 
diff --git a/evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh b/evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh new file mode 100755 index 000000000..5f66058b2 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Auxiliary functions for bi-grams + +bigrams_aux() +{ + s2=$(mktemp -u) + mkfifo $s2 + tee $s2 | + tail -n +2 | + paste $s2 - | + sed '$d' + rm $s2 +} + +bigram_aux_map() +{ + IN=$1 + OUT=$2 + AUX_HEAD=$3 + AUX_TAIL=$4 + + s2=$(mktemp -u) + aux1=$(mktemp -u) + aux2=$(mktemp -u) + aux3=$(mktemp -u) + temp=$(mktemp -u) + + mkfifo $s2 + mkfifo $aux1 + mkfifo $aux2 + mkfifo $aux3 + + ## New way of doing it using an intermediate file. This is slow + ## but doesn't deadlock + cat $IN > $temp + + sed '$d' $temp > $aux3 & + cat $temp | head -n 1 > $AUX_HEAD & + cat $temp | tail -n 1 > $AUX_TAIL & + cat $temp | tail -n +2 | paste $aux3 - > $OUT & + + # ## Old way of doing it + # cat $IN | + # tee $s2 $aux1 $aux2 | + # tail -n +2 | + # paste $s2 - > $OUT & + + # ## The goal of this is to write the first line of $IN in the $AUX_HEAD + # ## stream and the last line of $IN in $AUX_TAIL + + # cat $aux1 | ( head -n 1 > $AUX_HEAD; $PASH_TOP/evaluation/tools/drain_stream.sh ) & + # # while IFS= read -r line + # # do + # # old_line=$line + # # done < $aux2 + # # echo "$old_line" > $AUX_TAIL + # ( tail -n 1 $aux2 > $AUX_TAIL; $PASH_TOP/evaluation/tools/drain_stream.sh ) & + + wait + + rm $temp + rm $s2 + rm $aux1 + rm $aux2 + rm $aux3 +} + +bigram_aux_reduce() +{ + IN1=$1 + AUX_HEAD1=$2 + AUX_TAIL1=$3 + IN2=$4 + AUX_HEAD2=$5 + AUX_TAIL2=$6 + OUT=$7 + AUX_HEAD_OUT=$8 + AUX_TAIL_OUT=$9 + + temp=$(mktemp -u) + + mkfifo $temp + + cat $AUX_HEAD1 > $AUX_HEAD_OUT & + cat $AUX_TAIL2 > $AUX_TAIL_OUT & + paste $AUX_TAIL1 $AUX_HEAD2 > $temp & + cat $IN1 $temp $IN2 > $OUT & + + wait + + rm $temp +} + +export -f bigrams_aux +export -f bigram_aux_map +export -f bigram_aux_reduce diff --git 
a/evaluation/distr_benchmarks/oneliners/bi-grams.sh b/evaluation/distr_benchmarks/oneliners/bi-grams.sh new file mode 100755 index 000000000..a081a05ec --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/bi-grams.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Find all 2-grams in a piece of text + +IN=${IN:-/1G.txt} + +. bi-gram.aux.sh + +hdfs dfs -cat $IN | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + bigrams_aux | + sort | + uniq + + diff --git a/evaluation/distr_benchmarks/oneliners/diff.sh b/evaluation/distr_benchmarks/oneliners/diff.sh new file mode 100755 index 000000000..9435ad1d7 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/diff.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Compares two streams element by element +# Taken from https://crashingdaily.wordpress.com/2008/03/06/diff-two-stdout-streams/ +# shuf() { awk 'BEGIN {srand(); OFMT="%.17f"} {print rand(), $0}' "$@" | sort -k1,1n | cut -d ' ' -f2-; } + +IN=${IN:-/1G.txt} + +mkfifo s1 s2 + +hdfs dfs -cat $IN | + # shuf | + tr [:lower:] [:upper:] | + sort > s1 & + +hdfs dfs -cat $IN | + # shuf | + tr [:upper:] [:lower:] | + sort > s2 & + +diff -B s1 s2 +rm s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/input/.gitignore b/evaluation/distr_benchmarks/oneliners/input/.gitignore new file mode 100644 index 000000000..047dcd20b --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/input/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!setup.sh diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh new file mode 100755 index 000000000..eb08a2d42 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +#set -e + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +# another solution for capturing HTTP status code +# https://superuser.com/a/590170 +input_files="1M.txt 10M.txt 100M.txt 1G.txt dict.txt 3G.txt 10G.txt 100G.txt all_cmds.txt all_cmdsx100.txt small" + +if [ ! 
-f ./1M.txt ]; then + curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt + if [ $? -ne 0 ]; then + echo 'cannot find 1M.txt -- please contact the developers of pash' + exit 1 + fi +fi + +if [ ! -f ./10M.txt ]; then + touch 10M.txt + for (( i = 0; i < 10; i++ )); do + cat 1M.txt >> 10M.txt + done +fi + +if [ ! -f ./100M.txt ]; then + touch 100M.txt + for (( i = 0; i < 10; i++ )); do + cat 10M.txt >> 100M.txt + done +fi + +if [ ! -f ./1G.txt ]; then + curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt + if [ $? -ne 0 ]; then + echo 'cannot find 1G.txt -- please contact the developers of pash' + exit 1 + fi +fi + +# download wamerican-insane dictionary and sort according to machine +if [ ! -f ./dict.txt ]; then + curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt + if [ $? -ne 0 ]; then + echo 'cannot find dict.txt -- please contact the developers of pash' + exit 1 + fi +fi + +if [ ! -f ./all_cmds.txt ]; then + curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt + if [ $? -ne 0 ]; then + # This should be OK for tests, no need for abort + ls /usr/bin/* > all_cmds.txt + fi +fi + +hdfs dfs -put ./10M.txt /10M.txt +hdfs dfs -put ./100M.txt /100M.txt +hdfs dfs -put ./1G.txt /1G.txt +hdfs dfs -put ./all_cmds.txt + + +if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then + echo "Generating full-size inputs" + # FIXME PR: Do we need all of them? + + # if [ ! -f ./3G.txt ]; then + # touch 3G.txt + # for (( i = 0; i < 3; i++ )); do + # cat 1G.txt >> 3G.txt + # done + # fi + # hdfs dfs -put ./3G.txt /3G.txt + + # if [ ! -f ./10G.txt ]; then + # touch 10G.txt + # for (( i = 0; i < 10; i++ )); do + # cat 1G.txt >> 10G.txt + # done + # fi + # hdfs dfs -put ./10G.txt /10G.txt + + if [ ! 
-f ./all_cmdsx100.txt ]; then + touch all_cmdsx100.txt + for (( i = 0; i < 100; i++ )); do + cat all_cmds.txt >> all_cmdsx100.txt + done + fi + hdfs dfs -put ./all_cmdsx100.txt /all_cmdsx100.txt +fi \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh new file mode 100755 index 000000000..2a2c30718 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Match complex regular-expression over input + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | tr A-Z a-z | grep '\(.\).*\1\(.\).*\2\(.\).*\3\(.\).*\4' diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh new file mode 100755 index 000000000..7a3b4a4f2 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -0,0 +1,90 @@ +PASH_FLAGS='--width 6 --r_split' +export TIMEFORMAT=%R +export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" + +scripts_inputs=( + "nfa-regex;100M.txt" + "sort;3G.txt" + "top-n;3G.txt" + "wf;3G.txt" + "spell;3G.txt" + "diff;3G.txt" + "bi-grams;3G.txt" + "set-diff;3G.txt" + "sort-sort;3G.txt" + "shortest-scripts;all_cmdsx100.txt" + ) + +oneliners_bash() { + seq_times_file="seq.res" + seq_outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$seq_times_file" + cat $seq_times_file > $seq_times_file.d + echo executing one-liners $(date) | tee -a "$seq_times_file" + echo '' > "$seq_times_file" + + for script_input in ${scripts_inputs[@]} + do + IFS=";" read -r -a script_input_parsed <<< "${script_input}" + script="${script_input_parsed[0]}" + input="${script_input_parsed[1]}" + + export IN="/$input" + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) 
| tee -a "$seq_times_file" + done +} + +oneliners_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat $times_file > $times_file.d + echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" + echo '' > "$times_file" + + for script_input in ${scripts_inputs[@]} + do + IFS=";" read -r -a script_input_parsed <<< "${script_input}" + script="${script_input_parsed[0]}" + input="${script_input_parsed[1]}" + + export IN="/$input" + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done +} + +# oneliners_bash +oneliners_pash "$PASH_FLAGS" "par" +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" diff --git a/evaluation/distr_benchmarks/oneliners/set-diff.sh b/evaluation/distr_benchmarks/oneliners/set-diff.sh new file mode 100755 index 000000000..039e6996f --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/set-diff.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Show the set-difference between two streams (i.e., elements in the first that are not in the second). 
+# https://stackoverflow.com/questions/2509533/bash-linux-set-difference-between-two-text-files + +IN=${IN:-/1G.txt} + +mkfifo s1 s2 + +hdfs dfs -cat $IN | + cut -d ' ' -f 1 | + tr [:lower:] [:upper:] | + sort > s1 & + +hdfs dfs -cat $IN | + cut -d ' ' -f 1 | + sort > s2 & + +comm -23 s1 s2 + +rm s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh new file mode 100755 index 000000000..f6bac1b15 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Find the shortest scripts +# From "Wicked Cool Shell Scripts", 2nd Ed., pg. 7 +# +p.95 multiple sed +# +p.XX crawler + +# FIX: Input here should be a set of commands, more precisely, the ones on this specific machine. + +IN=${IN:-/all_cmds.txt} + +hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 diff --git a/evaluation/distr_benchmarks/oneliners/sort-sort.sh b/evaluation/distr_benchmarks/oneliners/sort-sort.sh new file mode 100755 index 000000000..7b51ed889 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/sort-sort.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Calculate sort twice + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | tr A-Z a-z | sort | sort -r diff --git a/evaluation/distr_benchmarks/oneliners/sort.sh b/evaluation/distr_benchmarks/oneliners/sort.sh new file mode 100755 index 000000000..29cffa1cf --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/sort.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Sort input + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | sort + diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh new file mode 100755 index 000000000..a5803a5c5 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/spell.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Calculate mispelled words in an input +# https://dl.acm.org/doi/10.1145/3532.315102 
+IN=${IN:-/1G.txt} +dict=${dict:-$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt} + +hdfs dfs -cat $IN | + iconv -f utf-8 -t ascii//translit | # remove non utf8 characters + # groff -t -e -mandoc -Tascii | # remove formatting commands + col -bx | # remove backspaces / linefeeds + tr -cs A-Za-z '\n' | + tr A-Z a-z | # map upper to lower case + tr -d '[:punct:]' | # remove punctuation + sort | # put words in alphabetical order + uniq | # remove duplicate words + comm -23 - $dict # report words not in dictionary diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh new file mode 100755 index 000000000..ac6fbb50e --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/top-n.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Top-N (1000) terms +# from https://dl.acm.org/doi/10.1145/5948.315654 + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q + diff --git a/evaluation/distr_benchmarks/oneliners/wf.sh b/evaluation/distr_benchmarks/oneliners/wf.sh new file mode 100755 index 000000000..a8a885775 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/wf.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Calculate the frequency of each word in the document, and sort by frequency + +IN=${IN:-/10M.txt} + +hdfs dfs -cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn From 748f7ab79e933d4107c483d0f85ad0a33f39bf0f Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 26 Apr 2022 13:47:11 +0000 Subject: [PATCH 03/37] added gitingore for dependecy_untagling --- .../distr_benchmarks/dependency_untangling/input/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 evaluation/distr_benchmarks/dependency_untangling/input/.gitignore diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/.gitignore b/evaluation/distr_benchmarks/dependency_untangling/input/.gitignore new file mode 100644 index 000000000..85940a3d2 --- /dev/null +++ 
b/evaluation/distr_benchmarks/dependency_untangling/input/.gitignore @@ -0,0 +1,4 @@ +* +!.gitignore +!setup.sh +!install-deps.sh From f89cbddb78e993d9d251d31382207c0100ff7014 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 27 Apr 2022 20:08:05 +0000 Subject: [PATCH 04/37] add stateless and pure function annotations Signed-off-by: Tammam Mustafa --- annotations/pure_func.json | 12 ++++++++++++ annotations/stateless_func.json | 12 ++++++++++++ .../distr_benchmarks/dependency_untangling/to_mp3.sh | 6 +++--- 3 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 annotations/pure_func.json create mode 100644 annotations/stateless_func.json diff --git a/annotations/pure_func.json b/annotations/pure_func.json new file mode 100644 index 000000000..1d9163a3f --- /dev/null +++ b/annotations/pure_func.json @@ -0,0 +1,12 @@ +{ + "command": "pure_func", + "cases": + [ + { + "predicate": "default", + "class": "pure", + "inputs": ["stdin"], + "outputs": ["stdout"] + } + ] +} diff --git a/annotations/stateless_func.json b/annotations/stateless_func.json new file mode 100644 index 000000000..f6a62ec6c --- /dev/null +++ b/annotations/stateless_func.json @@ -0,0 +1,12 @@ +{ + "command": "stateless_func", + "cases": + [ + { + "predicate": "default", + "class": "stateless", + "inputs": ["stdin"], + "outputs": ["stdout"] + } + ] +} diff --git a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh index 1f84bb277..c94a75b49 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh @@ -4,18 +4,18 @@ IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/wav} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mp3} LOGS=${OUT}/logs mkdir -p ${LOGS} -trigrams_aux(){ +pure_func(){ ffmpeg -y -i pipe:0 -f mp3 -ab 192000 pipe:1 2>/dev/null } -export -f trigrams_aux +export 
-f pure_func pkg_count=0 for item in $(hdfs dfs -ls -C /for-loops/wav); do pkg_count=$((pkg_count + 1)); out="$OUT/$(basename $item).mp3" - hdfs dfs -cat $item | trigrams_aux > $out + hdfs dfs -cat $item | pure_func > $out done echo 'done'; From ab5c3b94984f3464f71e98fee9ea4609feb30b81 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 27 Apr 2022 21:52:48 +0000 Subject: [PATCH 05/37] improve oneliners hdfs setup script Signed-off-by: Tammam Mustafa --- .../distr_benchmarks/oneliners/input/setup.sh | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index eb08a2d42..b7d0eb1e7 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -3,10 +3,12 @@ #set -e PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} +REPLICATION_FACTOR=2 # another solution for capturing HTTP status code # https://superuser.com/a/590170 -input_files="1M.txt 10M.txt 100M.txt 1G.txt dict.txt 3G.txt 10G.txt 100G.txt all_cmds.txt all_cmdsx100.txt small" +input_files=("1M.txt" "10M.txt" "100M.txt" "1G.txt" "all_cmds.txt" "all_cmdsx100.txt") +local_fils=("dict.txt") if [ ! -f ./1M.txt ]; then curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt @@ -55,37 +57,37 @@ if [ ! -f ./all_cmds.txt ]; then fi fi -hdfs dfs -put ./10M.txt /10M.txt -hdfs dfs -put ./100M.txt /100M.txt -hdfs dfs -put ./1G.txt /1G.txt -hdfs dfs -put ./all_cmds.txt +if [ ! -f ./all_cmdsx100.txt ]; then + touch all_cmdsx100.txt + for (( i = 0; i < 100; i++ )); do + cat all_cmds.txt >> all_cmdsx100.txt + done +fi if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then echo "Generating full-size inputs" - # FIXME PR: Do we need all of them? - - # if [ ! -f ./3G.txt ]; then - # touch 3G.txt - # for (( i = 0; i < 3; i++ )); do - # cat 1G.txt >> 3G.txt - # done - # fi - # hdfs dfs -put ./3G.txt /3G.txt - - # if [ ! 
-f ./10G.txt ]; then - # touch 10G.txt - # for (( i = 0; i < 10; i++ )); do - # cat 1G.txt >> 10G.txt - # done - # fi - # hdfs dfs -put ./10G.txt /10G.txt - - if [ ! -f ./all_cmdsx100.txt ]; then - touch all_cmdsx100.txt - for (( i = 0; i < 100; i++ )); do - cat all_cmds.txt >> all_cmdsx100.txt + + + if [ ! -f ./3G.txt ]; then + touch 3G.txt + for (( i = 0; i < 3; i++ )); do + cat 1G.txt >> 3G.txt + done + fi + input_files+=("3G.txt") + + if [ ! -f ./10G.txt ]; then + touch 10G.txt + for (( i = 0; i < 10; i++ )); do + cat 1G.txt >> 10G.txt done fi - hdfs dfs -put ./all_cmdsx100.txt /all_cmdsx100.txt -fi \ No newline at end of file + input_files+=("10G.txt") +fi + + +for file in "${input_files[@]}"; do + hdfs dfs -Ddfs.replication=$REPLICATION_FACTOR -put $file /$file + rm $file # remove local file after putting it into hdfs +done \ No newline at end of file From 32a0b86fd3b30879eb91074453bea888f090e4d4 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 1 May 2022 12:51:46 +0000 Subject: [PATCH 06/37] fix path bug in spell.sh --- evaluation/distr_benchmarks/oneliners/spell.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh index a5803a5c5..7928babe4 100755 --- a/evaluation/distr_benchmarks/oneliners/spell.sh +++ b/evaluation/distr_benchmarks/oneliners/spell.sh @@ -2,7 +2,7 @@ # Calculate mispelled words in an input # https://dl.acm.org/doi/10.1145/3532.315102 IN=${IN:-/1G.txt} -dict=${dict:-$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt} +dict=${dict:-$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt} hdfs dfs -cat $IN | iconv -f utf-8 -t ascii//translit | # remove non utf8 characters From 2b0876b0eda04510fdf33f8dd2e57920871f8094 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 1 May 2022 13:00:17 +0000 Subject: [PATCH 07/37] allow varying replication factor in tests --- .../distr_benchmarks/oneliners/input/setup.sh | 
5 +++-- .../distr_benchmarks/oneliners/run.distr.sh | 20 +++++++++++++------ .../oneliners/shortest-scripts.sh | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index b7d0eb1e7..0ea6efd6c 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -86,8 +86,9 @@ if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then input_files+=("10G.txt") fi - +# Add files with different replication factors for file in "${input_files[@]}"; do - hdfs dfs -Ddfs.replication=$REPLICATION_FACTOR -put $file /$file + hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file + hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file rm $file # remove local file after putting it into hdfs done \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 7a3b4a4f2..699a23228 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -19,6 +19,7 @@ oneliners_bash() { seq_times_file="seq.res" seq_outputs_suffix="seq.out" outputs_dir="outputs" + rep=${3:-rep3} mkdir -p "$outputs_dir" @@ -33,7 +34,7 @@ oneliners_bash() { script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$input" + export IN="/$rep\_$input" printf -v pad %30s padded_script="${script}${pad}" @@ -48,6 +49,8 @@ oneliners_bash() { oneliners_pash(){ flags=${1:-$PASH_FLAGS} prefix=${2:-par} + rep=${3:-rep3} + prefix=$prefix\_$rep times_file="$prefix.res" outputs_suffix="$prefix.out" @@ -60,7 +63,7 @@ oneliners_pash(){ touch "$times_file" cat $times_file > $times_file.d - echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" + echo executing one-liners with $prefix pash with data $rep $(date) | tee -a "$times_file" echo '' > "$times_file" for script_input 
in ${scripts_inputs[@]} @@ -69,7 +72,7 @@ oneliners_pash(){ script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$input" + export IN="/$rep\_$input" printf -v pad %30s padded_script="${script}${pad}" @@ -85,6 +88,11 @@ oneliners_pash(){ done } -# oneliners_bash -oneliners_pash "$PASH_FLAGS" "par" -oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" +oneliners_bash "rep1" +oneliners_bash "rep3" + +oneliners_pash "$PASH_FLAGS" "par" "rep1" +oneliners_pash "$PASH_FLAGS" "par" "rep3" + +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3" diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh index f6bac1b15..63a5bc3d9 100755 --- a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh +++ b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh @@ -6,6 +6,6 @@ # FIX: Input here should be a set of commands, more precisely, the ones on this specific machine. 
-IN=${IN:-/all_cmds.txt} +IN=${IN:-/all_cmdsx100.txt} hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 From 906074254ed7c1ad930cbddfe521fc4da1113ebc Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 4 May 2022 00:12:00 +0000 Subject: [PATCH 08/37] improve benchmark scripts --- evaluation/distr_benchmarks/oneliners/input/setup.sh | 8 +++++--- evaluation/distr_benchmarks/oneliners/run.distr.sh | 12 +++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index 0ea6efd6c..3d4921c22 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -1,15 +1,18 @@ #!/bin/bash - #set -e PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} -REPLICATION_FACTOR=2 # another solution for capturing HTTP status code # https://superuser.com/a/590170 input_files=("1M.txt" "10M.txt" "100M.txt" "1G.txt" "all_cmds.txt" "all_cmdsx100.txt") local_fils=("dict.txt") +if [[ "$1" == "-c" ]]; then + rm -f $input_files "3G.txt" "10G.txt" + exit +fi + if [ ! -f ./1M.txt ]; then curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? 
-ne 0 ]; then @@ -90,5 +93,4 @@ fi for file in "${input_files[@]}"; do hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file - rm $file # remove local file after putting it into hdfs done \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 699a23228..dd27315e9 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -1,9 +1,9 @@ -PASH_FLAGS='--width 6 --r_split' +PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" scripts_inputs=( - "nfa-regex;100M.txt" + "nfa-regex;1G.txt" "sort;3G.txt" "top-n;3G.txt" "wf;3G.txt" @@ -16,10 +16,10 @@ scripts_inputs=( ) oneliners_bash() { - seq_times_file="seq.res" - seq_outputs_suffix="seq.out" outputs_dir="outputs" - rep=${3:-rep3} + rep=${1:-rep3} + seq_times_file="$rep\_seq.res" + seq_outputs_suffix="$rep\_seq.out" mkdir -p "$outputs_dir" @@ -35,6 +35,7 @@ oneliners_bash() { input="${script_input_parsed[1]}" export IN="/$rep\_$input" + export dict= printf -v pad %30s padded_script="${script}${pad}" @@ -73,6 +74,7 @@ oneliners_pash(){ input="${script_input_parsed[1]}" export IN="/$rep\_$input" + export dict= printf -v pad %30s padded_script="${script}${pad}" From bc03de75187a5161b54a9adfd5a0cc7c6415e568 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 8 May 2022 13:48:49 +0000 Subject: [PATCH 09/37] fix small bug --- evaluation/distr_benchmarks/oneliners/run.distr.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index dd27315e9..84913531c 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -1,6 +1,8 @@ PASH_FLAGS='--width 8 
--r_split' export TIMEFORMAT=%R -export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" +export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt" +curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict + scripts_inputs=( "nfa-regex;1G.txt" @@ -18,8 +20,8 @@ scripts_inputs=( oneliners_bash() { outputs_dir="outputs" rep=${1:-rep3} - seq_times_file="$rep\_seq.res" - seq_outputs_suffix="$rep\_seq.out" + seq_times_file=$rep"_seq.res" + seq_outputs_suffix=$rep"_seq.out" mkdir -p "$outputs_dir" @@ -34,7 +36,7 @@ oneliners_bash() { script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$rep\_$input" + export IN=/$rep\_$input export dict= printf -v pad %30s @@ -73,7 +75,7 @@ oneliners_pash(){ script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$rep\_$input" + export IN=/$rep\_$input export dict= printf -v pad %30s From e933a36d6e5bafb5a2a5407598300b360d02b208 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 10:55:15 -0400 Subject: [PATCH 10/37] port nlp scripts to distributed exec --- evaluation/distr_benchmarks/nlp/.gitignore | 2 + evaluation/distr_benchmarks/nlp/1_1.sh | 15 +++++ evaluation/distr_benchmarks/nlp/2_1.sh | 17 ++++++ evaluation/distr_benchmarks/nlp/2_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/3_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/3_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/3_3.sh | 16 +++++ evaluation/distr_benchmarks/nlp/4_3.sh | 19 ++++++ evaluation/distr_benchmarks/nlp/4_3b.sh | 25 ++++++++ evaluation/distr_benchmarks/nlp/6_1.sh | 27 +++++++++ evaluation/distr_benchmarks/nlp/6_1_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_1_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_2.sh | 18 ++++++ evaluation/distr_benchmarks/nlp/6_3.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_4.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_5.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_7.sh | 19 ++++++ 
evaluation/distr_benchmarks/nlp/7_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/7_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/8.2_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/8.2_2.sh | 26 ++++++++ evaluation/distr_benchmarks/nlp/8.3_2.sh | 24 ++++++++ evaluation/distr_benchmarks/nlp/8.3_3.sh | 25 ++++++++ evaluation/distr_benchmarks/nlp/8_1.sh | 23 ++++++++ .../distr_benchmarks/nlp/input/.gitignore | 4 ++ .../distr_benchmarks/nlp/input/setup.sh | 59 +++++++++++++++++++ 26 files changed, 495 insertions(+) create mode 100644 evaluation/distr_benchmarks/nlp/.gitignore create mode 100755 evaluation/distr_benchmarks/nlp/1_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/2_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/2_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/3_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/3_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/3_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/4_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/4_3b.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_1_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_1_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_4.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_5.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_7.sh create mode 100755 evaluation/distr_benchmarks/nlp/7_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/7_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.2_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.2_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.3_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.3_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/8_1.sh create mode 100644 evaluation/distr_benchmarks/nlp/input/.gitignore 
create mode 100755 evaluation/distr_benchmarks/nlp/input/setup.sh diff --git a/evaluation/distr_benchmarks/nlp/.gitignore b/evaluation/distr_benchmarks/nlp/.gitignore new file mode 100644 index 000000000..1dd206e6f --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/.gitignore @@ -0,0 +1,2 @@ +exodus +genesis diff --git a/evaluation/distr_benchmarks/nlp/1_1.sh b/evaluation/distr_benchmarks/nlp/1_1.sh new file mode 100755 index 000000000..7ff63c21a --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/1_1.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# tag: count_words + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/1_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/2_1.sh b/evaluation/distr_benchmarks/nlp/2_1.sh new file mode 100755 index 000000000..4e35100a8 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/2_1.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# tag: merge_upper +# set -e + +# Merge upper and lower counts +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -sc '[A-Z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/2_2.sh b/evaluation/distr_benchmarks/nlp/2_2.sh new file mode 100755 index 000000000..8111b23aa --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/2_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: count_vowel_seq +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input 
in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr 'a-z' '[A-Z]' | tr -sc 'AEIOU' '[\012*]'| sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/3_1.sh b/evaluation/distr_benchmarks/nlp/3_1.sh new file mode 100755 index 000000000..6082bb1c6 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/3_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: sort +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -nr > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/3_2.sh b/evaluation/distr_benchmarks/nlp/3_2.sh new file mode 100755 index 000000000..571481d1d --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/3_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: sort_words_by_folding +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -f > ${OUT}/${input} +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/3_3.sh b/evaluation/distr_benchmarks/nlp/3_3.sh new file mode 100755 index 000000000..ff67ea089 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/3_3.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: sort_words_by_rhyming.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) 
+do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | rev | sort | rev > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh new file mode 100755 index 000000000..c20a8cf0d --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# tag: bigrams.sh +# set -e + +# Bigrams (contrary to our version, this uses intermediary files) +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.input.words + tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords + paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/4_3b.sh b/evaluation/distr_benchmarks/nlp/4_3b.sh new file mode 100755 index 000000000..1df2cdd20 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/4_3b.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#tag: count_trigrams.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3b/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +pure_func() { + input=$1 + cat > ${OUT}/${input}.words + tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords + tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 + paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | + sort | uniq -c +} +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.trigrams +done + +echo 'done'; +rm -rf ${OUT} diff --git 
a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh new file mode 100755 index 000000000..8d8f29220 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# tag: trigram_rec +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/6_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +trigrams() { + input=$1 + tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.words + tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords + tail +3 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 + paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | sort | uniq -c + rm -f ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 +} +export -f trigrams + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN"/"$input | grep 'the land of' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 + hdfs dfs -cat $IN"/"$input | grep 'And he said' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_1_1.sh b/evaluation/distr_benchmarks/nlp/6_1_1.sh new file mode 100755 index 000000000..784e7b6a9 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_1_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: uppercase_by_token +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^[A-Z]' > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/6_1_2.sh b/evaluation/distr_benchmarks/nlp/6_1_2.sh new file mode 100755 index 000000000..779a0defb --- /dev/null +++ 
b/evaluation/distr_benchmarks/nlp/6_1_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: uppercase_by_type +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^[A-Z]' > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/6_2.sh b/evaluation/distr_benchmarks/nlp/6_2.sh new file mode 100755 index 000000000..021207494 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_2.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# tag: four-letter words +# set -e + +# the original script has both versions +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^....$' > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^....$' > ${OUT}/${input}.out1 +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_3.sh b/evaluation/distr_benchmarks/nlp/6_3.sh new file mode 100755 index 000000000..a4f15479c --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_3.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: words_no_vowels +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -vi '[aeiou]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_4.sh 
b/evaluation/distr_benchmarks/nlp/6_4.sh new file mode 100755 index 000000000..a7727b0a5 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_4.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: 1-syllable words +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_4/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat ${IN}/${input} | tr -sc '[A-Z][a-z]' '[\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*$' | sort | uniq -c | sed 5q > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_5.sh b/evaluation/distr_benchmarks/nlp/6_5.sh new file mode 100755 index 000000000..413d59696 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_5.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: 2-syllable words +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_5/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' ' [\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]$' | sort | uniq -c | sed 5q > ${OUT}${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_7.sh b/evaluation/distr_benchmarks/nlp/6_7.sh new file mode 100755 index 000000000..9396d23c0 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_7.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# tag: verse_2om_3om_2instances +# set -e +# verses with 2 or more, 3 or more, exactly 2 instances of light. 
+ +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_7/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | grep -c 'light.\*light' > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | grep -c 'light.\*light.\*light' > ${OUT}/${input}.out1 + hdfs dfs -cat $IN/$input | grep 'light.\*light' | grep -vc 'light.\*light.\*light' > ${OUT}/${input}.out2 +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/7_1.sh b/evaluation/distr_benchmarks/nlp/7_1.sh new file mode 100755 index 000000000..7f3f81518 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/7_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: count_morphs +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/7_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | sed 's/ly$/-ly/g' | sed 's/ .*//g' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/7_2.sh b/evaluation/distr_benchmarks/nlp/7_2.sh new file mode 100755 index 000000000..7ba0e1b38 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/7_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# set -e +# tag: count_consonant_sequences + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/7_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -sc 'BCDFGHJKLMNPQRSTVWXYZ' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/8.2_1.sh b/evaluation/distr_benchmarks/nlp/8.2_1.sh new file mode 100755 index 000000000..94bc2a383 --- 
/dev/null +++ b/evaluation/distr_benchmarks/nlp/8.2_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: vowel_sequences_gr_1K.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.2_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | tr -sc 'AEIOUaeiou' '[\012*]' | sort | uniq -c | awk "\$1 >= 1000" > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8.2_2.sh b/evaluation/distr_benchmarks/nlp/8.2_2.sh new file mode 100755 index 000000000..3ac31555d --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8.2_2.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# tag: bigrams_appear_twice.sh +# set -e + +# Calculate the bigrams (based on 4_3.sh script) +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.2_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +pure_func() { + input=$1 + cat > ${OUT}/${input}.input.words + tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords + paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams + awk "\$1 == 2 {print \$2, \$3}" ${OUT}/${input}.input.bigrams +} + +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8.3_2.sh b/evaluation/distr_benchmarks/nlp/8.3_2.sh new file mode 100755 index 000000000..a60da077c --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8.3_2.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# tag: find_anagrams.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p 
"$OUT" + +pure_func() { + input=$1 + sort -u > ${OUT}/${input}.types + rev < ${OUT}/${input}.types > ${OUT}/${input}.types.rev + sort ${OUT}/${input}.types ${OUT}/${input}.types.rev | uniq -c | awk "\$1 >= 2 {print \$2}" +} + +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh new file mode 100755 index 000000000..e397fb939 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# tag: compare_exodus_genesis.sh +# set -e + +IN=${IN:-/nlp/pg/} +INPUT2=${INPUT2:-$PASH_TOP/evaluation/distr_benchmarks/nlp/input/exodus} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p $OUT + +pure_func() { + input=$1 + cat > ${OUT}/${input}1.types + hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${OUT}/${input}2.types + sort $OUT/${input}1.types ${OUT}/${input}2.types ${OUT}/${input}2.types | uniq -c | head + +} +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8_1.sh b/evaluation/distr_benchmarks/nlp/8_1.sh new file mode 100755 index 000000000..b274e7946 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8_1.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# tag: sort_words_by_num_of_syllables +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +pure_func() { + input=$1 + cat > ${OUT}/${input}.words + tr -sc '[AEIOUaeiou\012]' ' ' < 
${OUT}/${input}.words | awk '{print NF}' > ${OUT}/${input}.syl + paste ${OUT}/${input}.syl ${OUT}/${input}.words | sort -nr | sed 5q +} +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/input/.gitignore b/evaluation/distr_benchmarks/nlp/input/.gitignore new file mode 100644 index 000000000..d815bc4e8 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/input/.gitignore @@ -0,0 +1,4 @@ +* +!pipelines.sh +!setup.sh +!.gitignore diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh new file mode 100755 index 000000000..1875bbb8a --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +[[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } + +setup_dataset() { + if [ ! -f ./genesis ]; then + curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis + "$PASH_TOP/scripts/append_nl_if_not.sh" genesis + fi + + if [ ! -f ./exodus ]; then + curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus + "$PASH_TOP/scripts/append_nl_if_not.sh" exodus + fi + + if [ ! -e ./pg ]; then + mkdir pg + cd pg + if [[ "$1" == "--gen-full" ]]; then + echo 'N.b.: download/extraction will take about 10min' + wget ndr.md/data/pg.tar.xz + if [ $? -ne 0 ]; then + cat <<-'EOF' | sed 's/^ *//' + Downloading input dataset failed, thus need to manually rsync all books from project gutenberg: + rsync -av --del --prune-empty-dirs --include='*.txt' --include='*/' --exclude='*' ftp@ftp.ibiblio.org::gutenberg . 
+ please contact the pash developers pash-devs@googlegroups.com + EOF + exit 1 + fi + cat pg.tar.xz | tar -xJ + else + wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip + unzip nlp.zip + mv data/* . + rm nlp.zip data -rf + fi + for f in *.txt; do + "$PASH_TOP/scripts/append_nl_if_not.sh" $f + done + cd .. + fi + + # Put files in hdfs + hdfs dfs -mkdir /nlp + hdfs dfs -put exodus /nlp/exodus + hdfs dfs -put genesis /nlp/genesis + hdfs dfs -put pg /nlp/pg +} + +source_var() { + if [[ "$1" == "--small" ]]; then + export ENTRIES=40 + else + # 1% of the input + export ENTRIES=1060 + fi +} From 3f65e428be9ade33e23ff95af4581d2baffce8b0 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 13:56:05 -0400 Subject: [PATCH 11/37] replace non parallelizable tr with parallelizable variation --- evaluation/distr_benchmarks/nlp/1_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/2_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/2_2.sh | 2 +- evaluation/distr_benchmarks/nlp/3_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/3_2.sh | 4 ++-- evaluation/distr_benchmarks/nlp/3_3.sh | 4 ++-- evaluation/distr_benchmarks/nlp/4_3.sh | 4 ++-- evaluation/distr_benchmarks/nlp/4_3b.sh | 4 ++-- evaluation/distr_benchmarks/nlp/6_1.sh | 2 +- evaluation/distr_benchmarks/nlp/6_1_1.sh | 2 +- evaluation/distr_benchmarks/nlp/6_1_2.sh | 2 +- evaluation/distr_benchmarks/nlp/6_2.sh | 4 ++-- evaluation/distr_benchmarks/nlp/6_3.sh | 2 +- evaluation/distr_benchmarks/nlp/6_4.sh | 2 +- evaluation/distr_benchmarks/nlp/6_5.sh | 2 +- evaluation/distr_benchmarks/nlp/8.2_1.sh | 2 +- evaluation/distr_benchmarks/nlp/8.2_2.sh | 2 +- evaluation/distr_benchmarks/nlp/8.3_2.sh | 2 +- evaluation/distr_benchmarks/nlp/8.3_3.sh | 2 +- evaluation/distr_benchmarks/nlp/8_1.sh | 2 +- evaluation/distr_benchmarks/oneliners/top-n.sh | 2 +- evaluation/distr_benchmarks/oneliners/wf.sh | 4 ++-- 22 files changed, 31 insertions(+), 31 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/1_1.sh 
b/evaluation/distr_benchmarks/nlp/1_1.sh index 7ff63c21a..50aa77bbd 100755 --- a/evaluation/distr_benchmarks/nlp/1_1.sh +++ b/evaluation/distr_benchmarks/nlp/1_1.sh @@ -2,13 +2,13 @@ # tag: count_words IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/1_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/1_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/2_1.sh b/evaluation/distr_benchmarks/nlp/2_1.sh index 4e35100a8..b89d2f48f 100755 --- a/evaluation/distr_benchmarks/nlp/2_1.sh +++ b/evaluation/distr_benchmarks/nlp/2_1.sh @@ -4,13 +4,13 @@ # Merge upper and lower counts IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/2_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -sc '[A-Z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/2_2.sh b/evaluation/distr_benchmarks/nlp/2_2.sh index 8111b23aa..39d8e9b0c 100755 --- a/evaluation/distr_benchmarks/nlp/2_2.sh +++ b/evaluation/distr_benchmarks/nlp/2_2.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_2/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/2_2/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" diff --git 
a/evaluation/distr_benchmarks/nlp/3_1.sh b/evaluation/distr_benchmarks/nlp/3_1.sh index 6082bb1c6..2a58b2861 100755 --- a/evaluation/distr_benchmarks/nlp/3_1.sh +++ b/evaluation/distr_benchmarks/nlp/3_1.sh @@ -3,13 +3,13 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/3_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -nr > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -nr > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/3_2.sh b/evaluation/distr_benchmarks/nlp/3_2.sh index 571481d1d..51d55ffdc 100755 --- a/evaluation/distr_benchmarks/nlp/3_2.sh +++ b/evaluation/distr_benchmarks/nlp/3_2.sh @@ -3,13 +3,13 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_2/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/3_2/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -f > ${OUT}/${input} + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -f > ${OUT}/${input} done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/3_3.sh b/evaluation/distr_benchmarks/nlp/3_3.sh index ff67ea089..909e5a4bd 100755 --- a/evaluation/distr_benchmarks/nlp/3_3.sh +++ b/evaluation/distr_benchmarks/nlp/3_3.sh @@ -3,13 +3,13 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_3/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/3_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} 
| head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | rev | sort | rev > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | rev | sort | rev > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index c20a8cf0d..100c78918 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -4,13 +4,13 @@ # Bigrams (contrary to our version, this uses intermediary files) IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.input.words + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" > ${OUT}/${input}.input.words tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams done diff --git a/evaluation/distr_benchmarks/nlp/4_3b.sh b/evaluation/distr_benchmarks/nlp/4_3b.sh index 1df2cdd20..a77f9dd26 100755 --- a/evaluation/distr_benchmarks/nlp/4_3b.sh +++ b/evaluation/distr_benchmarks/nlp/4_3b.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3b/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3b/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" @@ -18,7 +18,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.trigrams + hdfs dfs -cat 
$IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | pure_func $input > ${OUT}/${input}.trigrams done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 8d8f29220..39c328c20 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/6_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" diff --git a/evaluation/distr_benchmarks/nlp/6_1_1.sh b/evaluation/distr_benchmarks/nlp/6_1_1.sh index 784e7b6a9..c92af69ee 100755 --- a/evaluation/distr_benchmarks/nlp/6_1_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1_1.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^[A-Z]' > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -c '^[A-Z]' > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_1_2.sh b/evaluation/distr_benchmarks/nlp/6_1_2.sh index 779a0defb..72041d3e1 100755 --- a/evaluation/distr_benchmarks/nlp/6_1_2.sh +++ b/evaluation/distr_benchmarks/nlp/6_1_2.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^[A-Z]' > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | grep -c '^[A-Z]' > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_2.sh b/evaluation/distr_benchmarks/nlp/6_2.sh index 021207494..5227daffe 100755 --- a/evaluation/distr_benchmarks/nlp/6_2.sh +++ b/evaluation/distr_benchmarks/nlp/6_2.sh @@ -10,8 +10,8 @@ mkdir -p "$OUT" for input in 
$(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^....$' > ${OUT}/${input}.out0 - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^....$' > ${OUT}/${input}.out1 + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -c '^....$' > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | grep -c '^....$' > ${OUT}/${input}.out1 done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_3.sh b/evaluation/distr_benchmarks/nlp/6_3.sh index a4f15479c..699c3eafd 100755 --- a/evaluation/distr_benchmarks/nlp/6_3.sh +++ b/evaluation/distr_benchmarks/nlp/6_3.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -vi '[aeiou]' | sort | uniq -c > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -vi '[aeiou]' | sort | uniq -c > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_4.sh b/evaluation/distr_benchmarks/nlp/6_4.sh index a7727b0a5..bd47e042c 100755 --- a/evaluation/distr_benchmarks/nlp/6_4.sh +++ b/evaluation/distr_benchmarks/nlp/6_4.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat ${IN}/${input} | tr -sc '[A-Z][a-z]' '[\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*$' | sort | uniq -c | sed 5q > ${OUT}/${input}.out + hdfs dfs -cat ${IN}/${input} | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -i '^[^aeiou]*[aeiou][^aeiou]*$' | sort | uniq -c | sed 5q > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_5.sh b/evaluation/distr_benchmarks/nlp/6_5.sh index 413d59696..90d65e4a9 100755 --- 
a/evaluation/distr_benchmarks/nlp/6_5.sh +++ b/evaluation/distr_benchmarks/nlp/6_5.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' ' [\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]$' | sort | uniq -c | sed 5q > ${OUT}${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -i '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]$' | sort | uniq -c | sed 5q > ${OUT}${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.2_1.sh b/evaluation/distr_benchmarks/nlp/8.2_1.sh index 94bc2a383..f03a56985 100755 --- a/evaluation/distr_benchmarks/nlp/8.2_1.sh +++ b/evaluation/distr_benchmarks/nlp/8.2_1.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | tr -sc 'AEIOUaeiou' '[\012*]' | sort | uniq -c | awk "\$1 >= 1000" > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr -sc 'AEIOUaeiou' '[\012*]' | sort | uniq -c | awk "\$1 >= 1000" > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.2_2.sh b/evaluation/distr_benchmarks/nlp/8.2_2.sh index 3ac31555d..dc6ec685b 100755 --- a/evaluation/distr_benchmarks/nlp/8.2_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.2_2.sh @@ -19,7 +19,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.3_2.sh b/evaluation/distr_benchmarks/nlp/8.3_2.sh index a60da077c..47454d3b8 100755 --- 
a/evaluation/distr_benchmarks/nlp/8.3_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_2.sh @@ -17,7 +17,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index e397fb939..22dfe96c3 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -18,7 +18,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8_1.sh b/evaluation/distr_benchmarks/nlp/8_1.sh index b274e7946..7973476ba 100755 --- a/evaluation/distr_benchmarks/nlp/8_1.sh +++ b/evaluation/distr_benchmarks/nlp/8_1.sh @@ -16,7 +16,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh index ac6fbb50e..c2f7f2b21 100755 --- a/evaluation/distr_benchmarks/oneliners/top-n.sh +++ b/evaluation/distr_benchmarks/oneliners/top-n.sh @@ -4,5 +4,5 @@ IN=${IN:-/1G.txt} -hdfs dfs 
-cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q +hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q diff --git a/evaluation/distr_benchmarks/oneliners/wf.sh b/evaluation/distr_benchmarks/oneliners/wf.sh index a8a885775..eea9c2c58 100755 --- a/evaluation/distr_benchmarks/oneliners/wf.sh +++ b/evaluation/distr_benchmarks/oneliners/wf.sh @@ -1,6 +1,6 @@ #!/bin/bash # Calculate the frequency of each word in the document, and sort by frequency -IN=${IN:-/10M.txt} +IN=${IN:-/rep3_10M.txt} -hdfs dfs -cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn +hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn From 9d11dd64e68c6a6767d06265d8955226780b1d0b Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 15:14:42 -0400 Subject: [PATCH 12/37] nlp eval script --- evaluation/distr_benchmarks/nlp/8.3_3.sh | 2 +- evaluation/distr_benchmarks/nlp/run.distr.sh | 112 +++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100755 evaluation/distr_benchmarks/nlp/run.distr.sh diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 22dfe96c3..71b873a21 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -INPUT2=${INPUT2:-$PASH_TOP/evaluation/distr_benchmarks/nlp/input/exodus} +INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} ENTRIES=${ENTRIES:-1060} mkdir -p $OUT diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh new file mode 100755 index 000000000..48dba3fa6 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -0,0 +1,112 @@ +PASH_FLAGS='--width 8 --r_split --parallel_pipelines' +export TIMEFORMAT=%R + +if [[ "$1" == "--small" ]]; 
then + echo "Using small input" + export ENTRIES=40 +else + echo "Using full input" + export ENTRIES=1060 +fi + +names_scripts=( + "1syllable_words;6_4" + "2syllable_words;6_5" + "4letter_words;6_2" + "bigrams_appear_twice;8.2_2" + "bigrams;4_3" + "compare_exodus_genesis;8.3_3" + "count_consonant_seq;7_2" + # "count_morphs;7_1" + "count_trigrams;4_3b" + "count_vowel_seq;2_2" + "count_words;1_1" + "find_anagrams;8.3_2" + "merge_upper;2_1" + "sort;3_1" + "sort_words_by_folding;3_2" + "sort_words_by_num_of_syllables;8_1" + "sort_words_by_rhyming;3_3" + # "trigram_rec;6_1" + "uppercase_by_token;6_1_1" + "uppercase_by_type;6_1_2" + "verses_2om_3om_2instances;6_7" + "vowel_sequencies_gr_1K;8.2_1" + "words_no_vowels;6_3" + ) + +bash_nlp(){ + outputs_dir="outputs" + rep=${1:-rep3} + times_file=$rep"_seq.res" + outputs_suffix=$rep"_seq.out" + + mkdir -p "$outputs_dir" + + touch "$times_file" + echo executing Unix-for-nlp $(date) | tee -a "$times_file" + echo '' >> "$times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + printf -v pad %30s + padded_script="${name}.sh:${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done + cd .. 
+} + +nlp_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + rep=${3:-rep3} + prefix=$prefix\_$rep + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + echo executing Unix-for-nlp with pash $(date) | tee -a "$times_file" + echo '' >> "$times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + printf -v pad %30s + padded_script="${name}.sh:${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $PASH_FLAGS --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done + cd .. 
+} + +# bash_nlp "rep1" +bash_nlp "rep3" + +# nlp_pash "$PASH_FLAGS" "par" "rep1" +nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 6" "par" "rep3" + +# nlp_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" +nlp_pash "$PASH_FLAGS --distributed_exec --parallel_pipelines_limit 24" "distr" "rep3" From eecff28b6165143d6e4c24db90609dc2d7c90f35 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 15:41:44 -0400 Subject: [PATCH 13/37] fix incorrect flags --- evaluation/distr_benchmarks/nlp/run.distr.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index 48dba3fa6..58b84481c 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -96,7 +96,7 @@ nlp_pash(){ single_time_file="${outputs_dir}/${script}.${time_suffix}" echo -n "${padded_script}" | tee -a "$times_file" - { time "$PASH_TOP/pa.sh" $PASH_FLAGS --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done cd .. 
@@ -106,7 +106,7 @@ nlp_pash(){ bash_nlp "rep3" # nlp_pash "$PASH_FLAGS" "par" "rep1" -nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 6" "par" "rep3" +nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 8" "par" "rep3" # nlp_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" nlp_pash "$PASH_FLAGS --distributed_exec --parallel_pipelines_limit 24" "distr" "rep3" From 4749aa4df02ac050629fc5c5d7f075170939e6c3 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 09:54:45 -0400 Subject: [PATCH 14/37] fixed small issues in eval scripts --- evaluation/distr_benchmarks/nlp/run.distr.sh | 42 +++++++++---------- .../distr_benchmarks/oneliners/run.distr.sh | 14 +++---- evaluation/distr_benchmarks/run_all.sh | 5 +++ 3 files changed, 32 insertions(+), 29 deletions(-) create mode 100755 evaluation/distr_benchmarks/run_all.sh diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index 58b84481c..b693d5065 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -1,12 +1,12 @@ -PASH_FLAGS='--width 8 --r_split --parallel_pipelines' +PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R -if [[ "$1" == "--small" ]]; then - echo "Using small input" - export ENTRIES=40 +if [[ "$1" == "--full" ]]; then + echo "Using full input" + export ENTRIES=1060 else - echo "Using full input" - export ENTRIES=1060 + echo "Using small input" + export ENTRIES=120 fi names_scripts=( @@ -14,7 +14,7 @@ names_scripts=( "2syllable_words;6_5" "4letter_words;6_2" "bigrams_appear_twice;8.2_2" - "bigrams;4_3" + # "bigrams;4_3" "compare_exodus_genesis;8.3_3" "count_consonant_seq;7_2" # "count_morphs;7_1" @@ -37,14 +37,14 @@ names_scripts=( bash_nlp(){ outputs_dir="outputs" - rep=${1:-rep3} - times_file=$rep"_seq.res" - outputs_suffix=$rep"_seq.out" + times_file="seq.res" + outputs_suffix="seq.out" mkdir -p "$outputs_dir" touch "$times_file" - echo executing Unix-for-nlp $(date) | tee -a 
"$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix-for-nlp $(date) | tee "$times_file" echo '' >> "$times_file" for name_script in ${names_scripts[@]} @@ -60,14 +60,11 @@ bash_nlp(){ echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" done - cd .. } nlp_pash(){ flags=${1:-$PASH_FLAGS} prefix=${2:-par} - rep=${3:-rep3} - prefix=$prefix\_$rep times_file="$prefix.res" outputs_suffix="$prefix.out" @@ -79,7 +76,8 @@ nlp_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - echo executing Unix-for-nlp with pash $(date) | tee -a "$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix-for-nlp with $prefix pash $(date) | tee "$times_file" echo '' >> "$times_file" for name_script in ${names_scripts[@]} @@ -99,14 +97,14 @@ nlp_pash(){ { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done - cd .. } -# bash_nlp "rep1" -bash_nlp "rep3" +bash_nlp -# nlp_pash "$PASH_FLAGS" "par" "rep1" -nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 8" "par" "rep3" +nlp_pash "$PASH_FLAGS" "par_no_du" -# nlp_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" -nlp_pash "$PASH_FLAGS --distributed_exec --parallel_pipelines_limit 24" "distr" "rep3" +nlp_pash "$PASH_FLAGS --parallel_pipelines --parallel_pipelines_limit 24" "par" + +nlp_pash "$PASH_FLAGS --distributed_exec" "distr_no_du" + +nlp_pash "$PASH_FLAGS --parallel_pipelines --distributed_exec --parallel_pipelines_limit 24" "distr" diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 84913531c..6eeaf36ac 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -27,8 +27,8 @@ oneliners_bash() { touch "$seq_times_file" cat $seq_times_file > $seq_times_file.d - echo executing one-liners $(date) | tee -a 
"$seq_times_file" - echo '' > "$seq_times_file" + echo executing one-liners $(date) | tee "$seq_times_file" + echo '' >> "$seq_times_file" for script_input in ${scripts_inputs[@]} do @@ -66,8 +66,8 @@ oneliners_pash(){ touch "$times_file" cat $times_file > $times_file.d - echo executing one-liners with $prefix pash with data $rep $(date) | tee -a "$times_file" - echo '' > "$times_file" + echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file" + echo '' >> "$times_file" for script_input in ${scripts_inputs[@]} do @@ -92,11 +92,11 @@ oneliners_pash(){ done } -oneliners_bash "rep1" +# oneliners_bash "rep1" oneliners_bash "rep3" -oneliners_pash "$PASH_FLAGS" "par" "rep1" +# oneliners_pash "$PASH_FLAGS" "par" "rep1" oneliners_pash "$PASH_FLAGS" "par" "rep3" -oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" +# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3" diff --git a/evaluation/distr_benchmarks/run_all.sh b/evaluation/distr_benchmarks/run_all.sh new file mode 100755 index 000000000..9162bd352 --- /dev/null +++ b/evaluation/distr_benchmarks/run_all.sh @@ -0,0 +1,5 @@ +cd $PASH_TOP/evaluation/distr_benchmarks/oneliners +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/nlp +bash run.distr.sh \ No newline at end of file From 747527d8aff54138e4106b02b0ea7b124d2df20d Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 09:55:34 -0400 Subject: [PATCH 15/37] added gitignores to outputs and inputs --- evaluation/distr_benchmarks/.gitignore | 1 + evaluation/distr_benchmarks/dependency_untangling/.gitignore | 3 +++ evaluation/distr_benchmarks/nlp/.gitignore | 1 + 3 files changed, 5 insertions(+) create mode 100644 evaluation/distr_benchmarks/.gitignore create mode 100644 evaluation/distr_benchmarks/dependency_untangling/.gitignore diff --git a/evaluation/distr_benchmarks/.gitignore b/evaluation/distr_benchmarks/.gitignore new file 
mode 100644 index 000000000..e6d35e74c --- /dev/null +++ b/evaluation/distr_benchmarks/.gitignore @@ -0,0 +1 @@ +outputs \ No newline at end of file diff --git a/evaluation/distr_benchmarks/dependency_untangling/.gitignore b/evaluation/distr_benchmarks/dependency_untangling/.gitignore new file mode 100644 index 000000000..46428b369 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/.gitignore @@ -0,0 +1,3 @@ +input/* +!input/install-deps.sh +!setup.sh \ No newline at end of file diff --git a/evaluation/distr_benchmarks/nlp/.gitignore b/evaluation/distr_benchmarks/nlp/.gitignore index 1dd206e6f..6e99c238a 100644 --- a/evaluation/distr_benchmarks/nlp/.gitignore +++ b/evaluation/distr_benchmarks/nlp/.gitignore @@ -1,2 +1,3 @@ exodus genesis +outputs \ No newline at end of file From 18f0a086b6d12a4488c3fab4b02c628e58e467a6 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 11:26:44 -0400 Subject: [PATCH 16/37] use temp ffiles n pure functions --- evaluation/distr_benchmarks/nlp/4_3.sh | 16 +++++++++++++--- evaluation/distr_benchmarks/nlp/4_3b.sh | 10 ++++++---- evaluation/distr_benchmarks/nlp/6_1.sh | 12 +++++++----- evaluation/distr_benchmarks/nlp/8.2_2.sh | 10 ++++++---- evaluation/distr_benchmarks/nlp/8.3_2.sh | 8 +++++--- evaluation/distr_benchmarks/nlp/8.3_3.sh | 12 +++++++----- evaluation/distr_benchmarks/nlp/8_1.sh | 8 +++++--- 7 files changed, 49 insertions(+), 27 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index 100c78918..e493172ae 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -7,12 +7,22 @@ IN=${IN:-/nlp/pg/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" +echo $ENTRIES + +pure_func() { + input=$1 + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.input.words + tail +2 ${TEMPDIR}/${input}.input.words > ${TEMPDIR}/${input}.input.nextwords + paste 
${TEMPDIR}/${input}.input.words ${TEMPDIR}/${input}.input.nextwords + + rm -rf ${TEMPDIR} +} +export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" > ${OUT}/${input}.input.words - tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords - paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$"| pure_func $input| sort | uniq -c > ${OUT}/${input}.input.bigrams done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/4_3b.sh b/evaluation/distr_benchmarks/nlp/4_3b.sh index a77f9dd26..ce9f5b7eb 100755 --- a/evaluation/distr_benchmarks/nlp/4_3b.sh +++ b/evaluation/distr_benchmarks/nlp/4_3b.sh @@ -9,11 +9,13 @@ mkdir -p "$OUT" pure_func() { input=$1 - cat > ${OUT}/${input}.words - tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords - tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 - paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.words + tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords + tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords2 + paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c + rm -rf ${TEMPDIR} } export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 39c328c20..b0836db6e 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -9,11 +9,13 @@ mkdir -p "$OUT" trigrams() { input=$1 - tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.words - tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords 
- tail +3 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 - paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | sort | uniq -c - rm -f ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 + TEMPDIR=$(mktemp -d) + tr -sc '[A-Z][a-z]' '[\012*]' > ${TEMPDIR}/${input}.words + tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords + tail +3 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords2 + paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c + rm -f ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 + rm -rf ${TEMPDIR} } export -f trigrams diff --git a/evaluation/distr_benchmarks/nlp/8.2_2.sh b/evaluation/distr_benchmarks/nlp/8.2_2.sh index dc6ec685b..be50f92be 100755 --- a/evaluation/distr_benchmarks/nlp/8.2_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.2_2.sh @@ -10,10 +10,12 @@ mkdir -p "$OUT" pure_func() { input=$1 - cat > ${OUT}/${input}.input.words - tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords - paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams - awk "\$1 == 2 {print \$2, \$3}" ${OUT}/${input}.input.bigrams + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.input.words + tail +2 ${TEMPDIR}/${input}.input.words > ${TEMPDIR}/${input}.input.nextwords + paste ${TEMPDIR}/${input}.input.words ${TEMPDIR}/${input}.input.nextwords | sort | uniq -c > ${TEMPDIR}/${input}.input.bigrams + awk "\$1 == 2 {print \$2, \$3}" ${TEMPDIR}/${input}.input.bigrams + rm -rf {TEMPDIR} } export -f pure_func diff --git a/evaluation/distr_benchmarks/nlp/8.3_2.sh b/evaluation/distr_benchmarks/nlp/8.3_2.sh index 47454d3b8..59265d767 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_2.sh @@ -9,9 +9,11 @@ mkdir -p "$OUT" pure_func() { input=$1 - sort -u > ${OUT}/${input}.types - rev < 
${OUT}/${input}.types > ${OUT}/${input}.types.rev - sort ${OUT}/${input}.types ${OUT}/${input}.types.rev | uniq -c | awk "\$1 >= 2 {print \$2}" + TEMPDIR=$(mktemp -d) + sort -u > ${TEMPDIR}/${input}.types + rev < ${TEMPDIR}/${input}.types > ${TEMPDIR}/${input}.types.rev + sort ${TEMPDIR}/${input}.types ${TEMPDIR}/${input}.types.rev | uniq -c | awk "\$1 >= 2 {print \$2}" + rm -rf ${TEMPDIR} } export -f pure_func diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 71b873a21..18af5aef6 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -5,17 +5,19 @@ IN=${IN:-/nlp/pg/} INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} -ENTRIES=${ENTRIES:-1060} +ENTRIES=${ENTRIES:-1} mkdir -p $OUT pure_func() { input=$1 - cat > ${OUT}/${input}1.types - hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${OUT}/${input}2.types - sort $OUT/${input}1.types ${OUT}/${input}2.types ${OUT}/${input}2.types | uniq -c | head - + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}1.types + hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${TEMPDIR}/${input}2.types + sort ${TEMPDIR}/${input}1.types ${TEMPDIR}/${input}2.types ${TEMPDIR}/${input}2.types | uniq -c | head + rm -rf ${TEMPDIR} } export -f pure_func + for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out diff --git a/evaluation/distr_benchmarks/nlp/8_1.sh b/evaluation/distr_benchmarks/nlp/8_1.sh index 7973476ba..07a27ed22 100755 --- a/evaluation/distr_benchmarks/nlp/8_1.sh +++ b/evaluation/distr_benchmarks/nlp/8_1.sh @@ -9,9 +9,11 @@ mkdir -p "$OUT" pure_func() { input=$1 - cat > ${OUT}/${input}.words - tr -sc '[AEIOUaeiou\012]' ' ' < ${OUT}/${input}.words | awk '{print NF}' > ${OUT}/${input}.syl - 
paste ${OUT}/${input}.syl ${OUT}/${input}.words | sort -nr | sed 5q + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.words + tr -sc '[AEIOUaeiou\012]' ' ' < ${TEMPDIR}/${input}.words | awk '{print NF}' > ${TEMPDIR}/${input}.syl + paste ${TEMPDIR}/${input}.syl ${TEMPDIR}/${input}.words | sort -nr | sed 5q + rm -rf ${TEMPDIR} } export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) From fd068837df31c5440f72f65eddcfd166e6a9fc8f Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 11:36:10 -0400 Subject: [PATCH 17/37] minor nlp fixes --- evaluation/distr_benchmarks/nlp/4_3.sh | 1 - evaluation/distr_benchmarks/nlp/6_1.sh | 1 - evaluation/distr_benchmarks/nlp/7_1.sh | 2 +- evaluation/distr_benchmarks/nlp/8.3_3.sh | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index e493172ae..e817e36b8 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -15,7 +15,6 @@ pure_func() { cat > ${TEMPDIR}/${input}.input.words tail +2 ${TEMPDIR}/${input}.input.words > ${TEMPDIR}/${input}.input.nextwords paste ${TEMPDIR}/${input}.input.words ${TEMPDIR}/${input}.input.nextwords - rm -rf ${TEMPDIR} } export -f pure_func diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index b0836db6e..5b4181fb2 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -14,7 +14,6 @@ trigrams() { tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords tail +3 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords2 paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c - rm -f ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 rm -rf ${TEMPDIR} } export -f trigrams diff --git 
a/evaluation/distr_benchmarks/nlp/7_1.sh b/evaluation/distr_benchmarks/nlp/7_1.sh index 7f3f81518..c78172cb2 100755 --- a/evaluation/distr_benchmarks/nlp/7_1.sh +++ b/evaluation/distr_benchmarks/nlp/7_1.sh @@ -13,4 +13,4 @@ do done echo 'done'; -rm ${OUT} +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 18af5aef6..937522b3f 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -5,7 +5,7 @@ IN=${IN:-/nlp/pg/} INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} -ENTRIES=${ENTRIES:-1} +ENTRIES=${ENTRIES:-1060} mkdir -p $OUT pure_func() { From 051df829eb4ef34e71943d31ab8f61d6da09b309 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 23:56:26 -0600 Subject: [PATCH 18/37] ported unix50 for distributed exec --- evaluation/distr_benchmarks/unix50/.gitignore | 3 + evaluation/distr_benchmarks/unix50/1.sh | 6 + evaluation/distr_benchmarks/unix50/10.sh | 6 + evaluation/distr_benchmarks/unix50/11.sh | 6 + evaluation/distr_benchmarks/unix50/12.sh | 6 + evaluation/distr_benchmarks/unix50/13.sh | 6 + evaluation/distr_benchmarks/unix50/14.sh | 6 + evaluation/distr_benchmarks/unix50/15.sh | 6 + evaluation/distr_benchmarks/unix50/16.sh | 6 + evaluation/distr_benchmarks/unix50/17.sh | 6 + evaluation/distr_benchmarks/unix50/18.sh | 6 + evaluation/distr_benchmarks/unix50/19.sh | 6 + evaluation/distr_benchmarks/unix50/2.sh | 6 + evaluation/distr_benchmarks/unix50/20.sh | 6 + evaluation/distr_benchmarks/unix50/21.sh | 6 + evaluation/distr_benchmarks/unix50/22.sh | 6 + evaluation/distr_benchmarks/unix50/23.sh | 6 + evaluation/distr_benchmarks/unix50/24.sh | 6 + evaluation/distr_benchmarks/unix50/25.sh | 6 + evaluation/distr_benchmarks/unix50/26.sh | 6 + evaluation/distr_benchmarks/unix50/27.sh | 6 + evaluation/distr_benchmarks/unix50/28.sh | 6 + evaluation/distr_benchmarks/unix50/29.sh | 6 + 
evaluation/distr_benchmarks/unix50/3.sh | 6 + evaluation/distr_benchmarks/unix50/30.sh | 6 + evaluation/distr_benchmarks/unix50/31.sh | 6 + evaluation/distr_benchmarks/unix50/32.sh | 6 + evaluation/distr_benchmarks/unix50/33.sh | 6 + evaluation/distr_benchmarks/unix50/34.sh | 6 + evaluation/distr_benchmarks/unix50/35.sh | 6 + evaluation/distr_benchmarks/unix50/36.sh | 5 + evaluation/distr_benchmarks/unix50/4.sh | 6 + evaluation/distr_benchmarks/unix50/5.sh | 6 + evaluation/distr_benchmarks/unix50/6.sh | 6 + evaluation/distr_benchmarks/unix50/7.sh | 6 + evaluation/distr_benchmarks/unix50/8.sh | 6 + evaluation/distr_benchmarks/unix50/9.sh | 6 + .../distr_benchmarks/unix50/input/setup.sh | 69 ++++++++ .../unix50/input/split-unix50.sh | 17 ++ .../distr_benchmarks/unix50/input/unix50.sh | 151 ++++++++++++++++++ .../distr_benchmarks/unix50/run.distr.sh | 78 +++++++++ 41 files changed, 533 insertions(+) create mode 100644 evaluation/distr_benchmarks/unix50/.gitignore create mode 100755 evaluation/distr_benchmarks/unix50/1.sh create mode 100755 evaluation/distr_benchmarks/unix50/10.sh create mode 100755 evaluation/distr_benchmarks/unix50/11.sh create mode 100755 evaluation/distr_benchmarks/unix50/12.sh create mode 100755 evaluation/distr_benchmarks/unix50/13.sh create mode 100755 evaluation/distr_benchmarks/unix50/14.sh create mode 100755 evaluation/distr_benchmarks/unix50/15.sh create mode 100755 evaluation/distr_benchmarks/unix50/16.sh create mode 100755 evaluation/distr_benchmarks/unix50/17.sh create mode 100755 evaluation/distr_benchmarks/unix50/18.sh create mode 100755 evaluation/distr_benchmarks/unix50/19.sh create mode 100755 evaluation/distr_benchmarks/unix50/2.sh create mode 100755 evaluation/distr_benchmarks/unix50/20.sh create mode 100755 evaluation/distr_benchmarks/unix50/21.sh create mode 100755 evaluation/distr_benchmarks/unix50/22.sh create mode 100755 evaluation/distr_benchmarks/unix50/23.sh create mode 100755 evaluation/distr_benchmarks/unix50/24.sh create 
mode 100755 evaluation/distr_benchmarks/unix50/25.sh create mode 100755 evaluation/distr_benchmarks/unix50/26.sh create mode 100755 evaluation/distr_benchmarks/unix50/27.sh create mode 100755 evaluation/distr_benchmarks/unix50/28.sh create mode 100755 evaluation/distr_benchmarks/unix50/29.sh create mode 100755 evaluation/distr_benchmarks/unix50/3.sh create mode 100755 evaluation/distr_benchmarks/unix50/30.sh create mode 100755 evaluation/distr_benchmarks/unix50/31.sh create mode 100755 evaluation/distr_benchmarks/unix50/32.sh create mode 100755 evaluation/distr_benchmarks/unix50/33.sh create mode 100755 evaluation/distr_benchmarks/unix50/34.sh create mode 100755 evaluation/distr_benchmarks/unix50/35.sh create mode 100755 evaluation/distr_benchmarks/unix50/36.sh create mode 100755 evaluation/distr_benchmarks/unix50/4.sh create mode 100755 evaluation/distr_benchmarks/unix50/5.sh create mode 100755 evaluation/distr_benchmarks/unix50/6.sh create mode 100755 evaluation/distr_benchmarks/unix50/7.sh create mode 100755 evaluation/distr_benchmarks/unix50/8.sh create mode 100755 evaluation/distr_benchmarks/unix50/9.sh create mode 100755 evaluation/distr_benchmarks/unix50/input/setup.sh create mode 100755 evaluation/distr_benchmarks/unix50/input/split-unix50.sh create mode 100755 evaluation/distr_benchmarks/unix50/input/unix50.sh create mode 100755 evaluation/distr_benchmarks/unix50/run.distr.sh diff --git a/evaluation/distr_benchmarks/unix50/.gitignore b/evaluation/distr_benchmarks/unix50/.gitignore new file mode 100644 index 000000000..30547eafd --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/.gitignore @@ -0,0 +1,3 @@ +inputs/* +intermediary/* +*.txt diff --git a/evaluation/distr_benchmarks/unix50/1.sh b/evaluation/distr_benchmarks/unix50/1.sh new file mode 100755 index 000000000..9684112ce --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/1.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.0: extract the last name +hdfs 
dfs -cat $IN1 | cut -d ' ' -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/10.sh b/evaluation/distr_benchmarks/unix50/10.sh new file mode 100755 index 000000000..3ef1eef49 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/10.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.4: histogram of Belle's captures (-pawns) by each type of piece +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep '[KQRBN]' | cut -c 1-1 | sort | uniq -c | sort -nr + diff --git a/evaluation/distr_benchmarks/unix50/11.sh b/evaluation/distr_benchmarks/unix50/11.sh new file mode 100755 index 000000000..0bee1dba1 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/11.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.5: 4.4 + pawns +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort | uniq -c | sort -nr + diff --git a/evaluation/distr_benchmarks/unix50/12.sh b/evaluation/distr_benchmarks/unix50/12.sh new file mode 100755 index 000000000..5bf77c806 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/12.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.6: piece used the most by Belle +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep '\.' | cut -d '.' 
-f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort -r | uniq | head -n 3 | tail -n 1 + diff --git a/evaluation/distr_benchmarks/unix50/13.sh b/evaluation/distr_benchmarks/unix50/13.sh new file mode 100755 index 000000000..9702a3861 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/13.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN5=$IN_PRE/5.txt +# 5.1: extract hello world +hdfs dfs -cat $IN5 | grep 'print' | cut -d "\"" -f 2 | cut -c 1-12 + diff --git a/evaluation/distr_benchmarks/unix50/14.sh b/evaluation/distr_benchmarks/unix50/14.sh new file mode 100755 index 000000000..f67e4d81a --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/14.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN6=$IN_PRE/6.txt +# 6.1: order the bodies by how easy it would be to land on them in Thompson's Space Travel game when playing at the highest simulation scale +hdfs dfs -cat $IN6 | awk "{print \$2, \$0}" | sort -nr | cut -d ' ' -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/15.sh b/evaluation/distr_benchmarks/unix50/15.sh new file mode 100755 index 000000000..abe5c620c --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/15.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN7=$IN_PRE/7.txt +# 7.1: identify number of AT&T unix versions +hdfs dfs -cat $IN7 | cut -f 1 | grep 'AT&T' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/16.sh b/evaluation/distr_benchmarks/unix50/16.sh new file mode 100755 index 000000000..7b95222b5 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/16.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN7=$IN_PRE/7.txt +# 7.2: find most frequently occurring machine +hdfs dfs -cat $IN7 | cut -f 2 | sort -n | uniq -c | sort -nr | head -n 1 | tr -s ' ' '\n' | tail -n 1 + diff --git a/evaluation/distr_benchmarks/unix50/17.sh b/evaluation/distr_benchmarks/unix50/17.sh new file mode 100755 index 000000000..3a0246204 --- /dev/null +++ 
b/evaluation/distr_benchmarks/unix50/17.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN7=$IN_PRE/7.txt +# 7.3: all the decades in which a unix version was released +hdfs dfs -cat $IN7 | cut -f 4 | sort -n | cut -c 3-3 | uniq | sed s/\$/'0s'/ + diff --git a/evaluation/distr_benchmarks/unix50/18.sh b/evaluation/distr_benchmarks/unix50/18.sh new file mode 100755 index 000000000..c90d3ff9b --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/18.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.1: count unix birth-year +hdfs dfs -cat $IN8 | tr ' ' '\n' | grep 1969 | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/19.sh b/evaluation/distr_benchmarks/unix50/19.sh new file mode 100755 index 000000000..5eaceae8d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/19.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.2: find Bell Labs location where Dennis Ritchie had his office +hdfs dfs -cat $IN8 | grep 'Bell' | awk 'length <= 45' | cut -d ',' -f 2 | awk "{\$1=\$1};1" + diff --git a/evaluation/distr_benchmarks/unix50/2.sh b/evaluation/distr_benchmarks/unix50/2.sh new file mode 100755 index 000000000..82eb4b460 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/2.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.1: extract names and sort +hdfs dfs -cat $IN1 | cut -d ' ' -f 2 | sort + diff --git a/evaluation/distr_benchmarks/unix50/20.sh b/evaluation/distr_benchmarks/unix50/20.sh new file mode 100755 index 000000000..3d121b839 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/20.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.3: find names of the four people most involved with unix +hdfs dfs -cat $IN8 | grep '(' | cut -d '(' -f 2 | cut -d ')' -f 1 | head -n 1 + diff --git a/evaluation/distr_benchmarks/unix50/21.sh b/evaluation/distr_benchmarks/unix50/21.sh new file mode 
100755 index 000000000..9578ad223 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/21.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.4: find longest words without hyphens +hdfs dfs -cat $IN8 | tr -c "[a-z][A-Z]" '\n' | sort | awk "length >= 16" + diff --git a/evaluation/distr_benchmarks/unix50/22.sh b/evaluation/distr_benchmarks/unix50/22.sh new file mode 100755 index 000000000..44ccb21f6 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/22.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# # 8.5: Find second-most-freq 8-character word(s) without hyphens +# cat $IN8 > /dev/null + diff --git a/evaluation/distr_benchmarks/unix50/23.sh b/evaluation/distr_benchmarks/unix50/23.sh new file mode 100755 index 000000000..76cf0f938 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/23.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN91=$IN_PRE/9.1.txt +# 9.1: extract the word PORT +hdfs dfs -cat $IN91 | tr ' ' '\n' | grep '[A-Z]' | tr '[a-z]' '\n' | grep '[A-Z]' | tr -d '\n' | cut -c 1-4 + diff --git a/evaluation/distr_benchmarks/unix50/24.sh b/evaluation/distr_benchmarks/unix50/24.sh new file mode 100755 index 000000000..9ff9bf821 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/24.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN92=$IN_PRE/9.2.txt +# 9.2: extract the word BELL +hdfs dfs -cat $IN92 | cut -c 1-1 | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/25.sh b/evaluation/distr_benchmarks/unix50/25.sh new file mode 100755 index 000000000..b8f983ec0 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/25.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN93=$IN_PRE/9.3.txt +# 9.3: animal that used to decorate the Unix room +hdfs dfs -cat $IN93 | cut -c 1-2 | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/26.sh b/evaluation/distr_benchmarks/unix50/26.sh new file mode 100755 index 
000000000..aae9b34e4 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/26.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN94=$IN_PRE/9.4.txt +# 9.4: four corners with E centered, for an "X" configuration +hdfs dfs -cat $IN94 | tr ' ' '\n' | grep "\"" | sed 4d | cut -d "\"" -f 2 | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/27.sh b/evaluation/distr_benchmarks/unix50/27.sh new file mode 100755 index 000000000..41b1e5577 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/27.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN95=$IN_PRE/9.5.txt +# # 9.5: backwards running clock, in a backwards poem +# cat $IN95 > /dev/null + diff --git a/evaluation/distr_benchmarks/unix50/28.sh b/evaluation/distr_benchmarks/unix50/28.sh new file mode 100755 index 000000000..3f44d5cd3 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/28.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN96=$IN_PRE/9.6.txt +# 9.6: Follow the directions for grep +hdfs dfs -cat $IN96 | tr ' ' '\n' | grep '[A-Z]' | sed 1d | sed 3d | sed 3d | tr '[a-z]' '\n' | grep '[A-Z]' | sed 3d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/29.sh b/evaluation/distr_benchmarks/unix50/29.sh new file mode 100755 index 000000000..bb41389a0 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/29.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN97=$IN_PRE/9.7.txt +# 9.7: Four corners +hdfs dfs -cat $IN97 | sed 2d | sed 2d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/3.sh b/evaluation/distr_benchmarks/unix50/3.sh new file mode 100755 index 000000000..0519768a6 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/3.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.2: extract names and sort +hdfs dfs -cat $IN1 | head -n 2 | cut -d ' ' -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/30.sh 
b/evaluation/distr_benchmarks/unix50/30.sh new file mode 100755 index 000000000..bb13b5d36 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/30.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN98=$IN_PRE/9.8.txt +# 9.8: TELE-communications +hdfs dfs -cat $IN98 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 2d | sed 3d | sed 4d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/31.sh b/evaluation/distr_benchmarks/unix50/31.sh new file mode 100755 index 000000000..e9ba29c14 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/31.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN99=$IN_PRE/9.9.txt +# 9.9: +hdfs dfs -cat $IN99 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 1d | sed 2d | sed 3d | sed 5d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/32.sh b/evaluation/distr_benchmarks/unix50/32.sh new file mode 100755 index 000000000..a3183b7bd --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/32.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN10=$IN_PRE/10.txt +# 10.1: count Turing award recipients while working at Bell Labs +hdfs dfs -cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/33.sh b/evaluation/distr_benchmarks/unix50/33.sh new file mode 100755 index 000000000..d9c1675b4 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/33.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN10=$IN_PRE/10.txt +# 10.2: list Turing award recipients while working at Bell Labs +hdfs dfs -cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/34.sh b/evaluation/distr_benchmarks/unix50/34.sh new file mode 100755 index 000000000..d08551141 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/34.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN10=$IN_PRE/10.txt +# 10.3: extract Ritchie's username 
+hdfs dfs -cat $IN10 | grep 'Bell' | cut -f 2 | head -n 1 | fmt -w1 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + diff --git a/evaluation/distr_benchmarks/unix50/35.sh b/evaluation/distr_benchmarks/unix50/35.sh new file mode 100755 index 000000000..78436d485 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/35.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN11=$IN_PRE/11.txt +# 11.1: year Ritchie and Thompson receive the Hamming medal +hdfs dfs -cat $IN11 | grep 'UNIX' | cut -f 1 + diff --git a/evaluation/distr_benchmarks/unix50/36.sh b/evaluation/distr_benchmarks/unix50/36.sh new file mode 100755 index 000000000..376f8f23d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/36.sh @@ -0,0 +1,5 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN11=$IN_PRE/11.txt +# 11.2: most repeated first name in the list? +hdfs dfs -cat $IN11 | cut -f 2 | cut -d ' ' -f 1 | sort | uniq -c | sort -nr | head -n 1 | fmt -w1 | sed 1d diff --git a/evaluation/distr_benchmarks/unix50/4.sh b/evaluation/distr_benchmarks/unix50/4.sh new file mode 100755 index 000000000..e36bb9119 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/4.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.3: sort top first names +hdfs dfs -cat $IN1 | cut -d ' ' -f 1 | sort | uniq -c | sort -r + diff --git a/evaluation/distr_benchmarks/unix50/5.sh b/evaluation/distr_benchmarks/unix50/5.sh new file mode 100755 index 000000000..32148f8da --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/5.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN2=$IN_PRE/2.txt +# 2.1: get all Unix utilities +hdfs dfs -cat $IN2 | cut -d ' ' -f 4 | tr -d ',' + diff --git a/evaluation/distr_benchmarks/unix50/6.sh b/evaluation/distr_benchmarks/unix50/6.sh new file mode 100755 index 000000000..79147caf6 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/6.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN3=$IN_PRE/3.txt +# 
3.1: get lowercase first letter of last names (awk) +hdfs dfs -cat $IN3 | cut -d ' ' -f 2 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + diff --git a/evaluation/distr_benchmarks/unix50/7.sh b/evaluation/distr_benchmarks/unix50/7.sh new file mode 100755 index 000000000..f9d4a9908 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/7.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.1: find number of rounds +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep '\.' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/8.sh b/evaluation/distr_benchmarks/unix50/8.sh new file mode 100755 index 000000000..a0bb5153d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/8.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.2: find pieces captured by Belle +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/9.sh b/evaluation/distr_benchmarks/unix50/9.sh new file mode 100755 index 000000000..d31ba769a --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/9.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.3: find pieces captured by Belle with a pawn +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep -v '[KQRBN]' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/input/setup.sh b/evaluation/distr_benchmarks/unix50/input/setup.sh new file mode 100755 index 000000000..01d7aaa21 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/input/setup.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +#set -e + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +## FIXME: These inputs are already 1G when downloaded +## FIXME: Also, wget is not silent like curl in the other setup scripts. 
+ +inputs=( +1 10 11 12 2 3 4 5 6 7 8 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 +) + +if [[ "$1" == "-c" ]]; then + for input in ${inputs[@]} + do + rm -f "${input}.txt" + done + exit +fi + +setup_dataset() { + # Put files in hdfs + hdfs dfs -mkdir /unix50 + + # generate small inputs + # if [ "$#" -eq 1 ] && [ "$1" = "--small" ]; then + # if [ ! -d ./small ]; then + # echo "Generating small-size inputs" + # # FIXME PR: Do we need all of them? + # curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/unix50.zip' > unix50.zip + # unzip unix50.zip + # rm -f unix50.zip + # fi + # hdfs dfs -put small /unix50/small + # return 0 + # fi + + for input in ${inputs[@]} + do + if [ ! -f "${input}.txt" ]; then + wget "http://ndr.md/data/unix50/${input}.txt" + "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" + fi + hdfs dfs -put $file /unix50/$file + done + + # increase the original input size 10x + if [ "$#" -eq 1 ] && [ "$1" = "--extended" ]; then + EXTENDED_INPUT_DIR="extended_input/" + mkdir -p $EXTENDED_INPUT_DIR + for file in *.txt; do + rm $EXTENDED_INPUT_DIR/$file + for (( i = 0; i < 10; i++ )); do + cat $file >> $EXTENDED_INPUT_DIR/temp.txt + done + done + hdfs dfs -put $EXTENDED_INPUT_DIR /unix50/$EXTENDED_INPUT_DIR + rm -rf $EXTENDED_INPUT_DIR + fi +} + +source_var() { + if [[ "$1" == "--extended" ]]; then + export IN_PRE=/unix50/extended_input + else + export IN_PRE=/unix50 + fi +} diff --git a/evaluation/distr_benchmarks/unix50/input/split-unix50.sh b/evaluation/distr_benchmarks/unix50/input/split-unix50.sh new file mode 100755 index 000000000..a0afe145d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/input/split-unix50.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +awk -v RS= '{print > (NR ".txt")}' unix50.sh + +for file in *.txt; do + fname=$(basename -- "$file") + fscript="${fname%.*}".sh + echo $fscript + echo '#!/bin/bash' > $fscript + + echo 'export IN_PRE=${IN_PRE:-$PASH_TOP/evaluation/benchmarks/unix50/input}' >> $fscript + input=$(grep -o 'IN..' 
$file) + grep "^$(echo $input | xargs)=" unix50.sh >> $fscript + cat $file >> $fscript + echo '' >> $fscript +done + diff --git a/evaluation/distr_benchmarks/unix50/input/unix50.sh b/evaluation/distr_benchmarks/unix50/input/unix50.sh new file mode 100755 index 000000000..7c5182bc8 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/input/unix50.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# scripts from https://unixgame.io/ +# https://github.com/psinghbh/softsec.github.io +# input files https://github.com/psinghbh/softsec.github.io/tree/master/ctf/unixgame.io/challenges +# Which join is easier: http://www.theunixschool.com/2011/08/5-different-ways-to-join-all-lines-in.html +# 1 (default) + 3 + 1 + 1 + 6 + 1 + 1 + 3 + 5 + 9 + 3 + 2 + 1 = 37 (there are 3 missing) +# missing 8.5, 9.5, 12.1 + +if [[ -z "$IN_PRE" ]]; then + if [[ -z "$PASH_TOP" ]]; then + echo "Need to provide PASH_TOP, possibly $(git rev-parse --show-toplevel)" 1>&2 + exit 1 + else + export IN_PRE=$PASH_TOP/evaluation/benchmarks/unix50/input + fi +fi + +IN1=$IN_PRE/1.txt +IN2=$IN_PRE/2.txt +IN3=$IN_PRE/3.txt +IN4=$IN_PRE/4.txt +IN5=$IN_PRE/5.txt +IN6=$IN_PRE/6.txt +IN7=$IN_PRE/7.txt +IN8=$IN_PRE/8.txt +IN91=$IN_PRE/9.1.txt +IN92=$IN_PRE/9.2.txt +IN93=$IN_PRE/9.3.txt +IN94=$IN_PRE/9.4.txt +IN95=$IN_PRE/9.5.txt +IN96=$IN_PRE/9.6.txt +IN97=$IN_PRE/9.7.txt +IN98=$IN_PRE/9.8.txt +IN99=$IN_PRE/9.9.txt +IN10=$IN_PRE/10.txt +IN11=$IN_PRE/11.txt +IN12=$IN_PRE/12.txt + +# 1.0: extract the last name +cat $IN1 | cut -d ' ' -f 2 + +# 1.1: extract names and sort +cat $IN1 | cut -d ' ' -f 2 | sort + +# 1.2: extract names and sort +cat $IN1 | head -n 2 | cut -d ' ' -f 2 + +# 1.3: sort top first names +cat $IN1 | cut -d ' ' -f 1 | sort | uniq -c | sort -r + +# 2.1: get all Unix utilities +cat $IN2 | cut -d ' ' -f 4 | tr -d ',' + +# 3.1: get lowercase first letter of last names (awk) +cat $IN3 | cut -d ' ' -f 2 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + +# 4.1: find number of rounds +cat $IN4 | tr ' ' '\n' | grep '\.' 
| wc -l + +# 4.2: find pieces captured by Belle +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | wc -l + +# 4.3: find pieces captured by Belle with a pawn +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep -v '[KQRBN]' | wc -l + +# 4.4: histogram of Belle's captures (-pawns) by each type of piece +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep '[KQRBN]' | cut -c 1-1 | sort | uniq -c | sort -nr + +# 4.5: 4.4 + pawns +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort | uniq -c | sort -nr + +# 4.6: piece used the most by Belle +cat $IN4 | tr ' ' '\n' | grep '\.' | cut -d '.' -f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort -r | uniq | head -n 3 | tail -n 1 + +# 5.1: extract hello world +cat $IN5 | grep 'print' | cut -d "\"" -f 2 | cut -c 1-12 + +# 6.1: order the bodies by how easy it would be to land on them in Thompson's Space Travel game when playing at the highest simulation scale +cat $IN6 | awk "{print \$2, \$0}" | sort -nr | cut -d ' ' -f 2 + +# 7.1: identify number of AT&T unix versions +cat $IN7 | cut -f 1 | grep 'AT&T' | wc -l + +# 7.2: find most frequently occurring machine +cat $IN7 | cut -f 2 | sort -n | uniq -c | sort -nr | head -n 1 | tr -s ' ' '\n' | tail -n 1 + +# 7.3: all the decades in which a unix version was released +cat $IN7 | cut -f 4 | sort -n | cut -c 3-3 | uniq | sed s/\$/'0s'/ + +# 8.1: count unix birth-year +cat $IN8 | tr ' ' '\n' | grep 1969 | wc -l + +# 8.2: find Bell Labs location where Dennis Ritchie had his office +cat $IN8 | grep 'Bell' | awk 'length <= 45' | cut -d ',' -f 2 | awk "{\$1=\$1};1" + +# 8.3: find names of the four people most involved with unix +cat $IN8 | grep '(' | cut -d '(' -f 2 | cut -d ')' -f 1 | head -n 1 + +# 8.4: find longest words without hyphens +cat $IN8 | tr -c "[a-z][A-Z]" '\n' | sort | awk "length >= 16" + +# # 8.5: Find second-most-freq 8-character word(s) without hyphens +# cat $IN8 > /dev/null + +# 9.1: extract the 
word PORT +cat $IN91 | tr ' ' '\n' | grep '[A-Z]' | tr '[a-z]' '\n' | grep '[A-Z]' | tr -d '\n' | cut -c 1-4 + +# 9.2: extract the word BELL +cat $IN92 | cut -c 1-1 | tr -d '\n' + +# 9.3: animal that used to decorate the Unix room +cat $IN93 | cut -c 1-2 | tr -d '\n' + +# 9.4: four corners with E centered, for an "X" configuration +cat $IN94 | tr ' ' '\n' | grep "\"" | sed 4d | cut -d "\"" -f 2 | tr -d '\n' + +# # 9.5: backwards running clock, in a backwards poem +# cat $IN95 > /dev/null + +# 9.6: Follow the directions for grep +cat $IN96 | tr ' ' '\n' | grep '[A-Z]' | sed 1d | sed 3d | sed 3d | tr '[a-z]' '\n' | grep '[A-Z]' | sed 3d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 9.7: Four corners +cat $IN97 | sed 2d | sed 2d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 9.8: TELE-communications +cat $IN98 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 2d | sed 3d | sed 4d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 9.9: +cat $IN99 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 1d | sed 2d | sed 3d | sed 5d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 10.1: count Turing award recipients while working at Bell Labs +cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 | wc -l + +# 10.2: list Turing award recipients while working at Bell Labs +cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 + +# 10.3: extract Ritchie's username +cat $IN10 | grep 'Bell' | cut -f 2 | head -n 1 | fmt -w1 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + +# 11.1: year Ritchie and Thompson receive the Hamming medal +cat $IN11 | grep 'UNIX' | cut -f 1 + +# 11.2: most repeated first name in the list? +cat $IN11 | cut -f 2 | cut -d ' ' -f 1 | sort | uniq -c | sort -nr | head -n 1 | fmt -w1 | sed 1d + + +# # 12.1: transform this list of instructions such that if the snake follows the +# # new instructions top to bottom, it ends on the location of the apple. 
+# cat $IN12 > /dev/null diff --git a/evaluation/distr_benchmarks/unix50/run.distr.sh b/evaluation/distr_benchmarks/unix50/run.distr.sh new file mode 100755 index 000000000..2526bbbe4 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/run.distr.sh @@ -0,0 +1,78 @@ +PASH_FLAGS='--width 8 --r_split' +export TIMEFORMAT=%R + +if [[ "$1" == "--extended" ]]; then + echo "Using extended input" + export IN_PRE=/unix50/extended_input + else + export IN_PRE=/unix50 +fi + +unix50_bash(){ + times_file="seq.res" + outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix50 $(date) | tee "$times_file" + echo '' >> "$times_file" + + for number in `seq 36` + do + script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done +} + + +unix50_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix50 $(date) | tee "$times_file" + echo '' >> "$times_file" + + for number in `seq 36` + do + script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a 
"$times_file" + done +} + +unix50_bash + +unix50_pash "$PASH_FLAGS" "par" + +unix50_pash "$PASH_FLAGS --distributed_exec" "distr" From be3fae29b6dd5e460a0979b6f70951a764420219 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 23:58:11 -0600 Subject: [PATCH 19/37] ported analytics mts to distributed exec --- .../distr_benchmarks/analytics-mts/1.sh | 21 ++++++ .../distr_benchmarks/analytics-mts/2.sh | 22 ++++++ .../distr_benchmarks/analytics-mts/3.sh | 22 ++++++ .../distr_benchmarks/analytics-mts/4.sh | 21 ++++++ .../distr_benchmarks/analytics-mts/5.sh | 18 +++++ .../distr_benchmarks/analytics-mts/README.md | 10 +++ .../analytics-mts/input/.gitignore | 5 ++ .../analytics-mts/input/setup.sh | 43 +++++++++++ .../analytics-mts/run-experiment.sh | 36 +++++++++ .../analytics-mts/run.distr.sh | 75 +++++++++++++++++++ 10 files changed, 273 insertions(+) create mode 100755 evaluation/distr_benchmarks/analytics-mts/1.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/2.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/3.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/4.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/5.sh create mode 100644 evaluation/distr_benchmarks/analytics-mts/README.md create mode 100644 evaluation/distr_benchmarks/analytics-mts/input/.gitignore create mode 100755 evaluation/distr_benchmarks/analytics-mts/input/setup.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/run-experiment.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/run.distr.sh diff --git a/evaluation/distr_benchmarks/analytics-mts/1.sh b/evaluation/distr_benchmarks/analytics-mts/1.sh new file mode 100755 index 000000000..746087898 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/1.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Vehicles on the road per day + +# out1 + +# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | +# bzip2 -d | # decompress +# 
Replace the line below with the two lines above to stream the latest file +cat $IN | # assumes saved input + sed 's/T..:..:..//' | # hide times + cut -d ',' -f 1,3 | # keep only day and bus no + sort -u | # remove duplicate records due to time + cut -d ',' -f 1 | # keep all dates + sort | # preparing for uniq + uniq -c | # count unique dates + awk -v OFS="\t" "{print \$2,\$1}" # print first date, then count + +# diff out{1,} diff --git a/evaluation/distr_benchmarks/analytics-mts/2.sh b/evaluation/distr_benchmarks/analytics-mts/2.sh new file mode 100755 index 000000000..9de4272f8 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/2.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Days a vehicle is on the road + +# out1 + +# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | +# bzip2 -d | # decompress +# Replace the line below with the two lines above to stream the latest file +cat $IN | # assumes saved input + sed 's/T..:..:..//' | # hide times + cut -d ',' -f 3,1 | # keep only day and bus ID + sort -u | # removing duplicate day-buses + cut -d ',' -f 2 | # keep only bus ID + sort | # preparing for uniq + uniq -c | # count unique dates + sort -k1n | # sort in reverse numerical order + awk -v OFS="\t" "{print \$2,\$1}" # print first date, then count + +# diff out{1,} diff --git a/evaluation/distr_benchmarks/analytics-mts/3.sh b/evaluation/distr_benchmarks/analytics-mts/3.sh new file mode 100755 index 000000000..d1bd67024 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/3.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Hours each vehicle is on the road + +# out1 + +# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | +# bzip2 -d | # decompress +# Replace the line below with the two lines above to stream the latest file +cat $IN | # assumes saved input + sed 's/T\(..\):..:../,\1/' | # keep times only + cut -d ',' -f 1,2,4 | # keep only time date and bus id + sort -u | # removing duplicate entries + cut -d 
',' -f 3 | # keep only bus ID + sort | # preparing for uniq + uniq -c | # count hours per bus + sort -k1n | # sort in numerical order + awk -v OFS="\t" "{print \$2,\$1}" # print first date, then count + +# diff out{1,} diff --git a/evaluation/distr_benchmarks/analytics-mts/4.sh b/evaluation/distr_benchmarks/analytics-mts/4.sh new file mode 100755 index 000000000..e77f8efdf --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/4.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Hours monitored each day + +# out diff --git a/evaluation/distr_benchmarks/analytics-mts/README.md b/evaluation/distr_benchmarks/analytics-mts/README.md new file mode 100644 index 000000000..f6edda06c --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/README.md @@ -0,0 +1,10 @@ +# Mass-Transport System Analytics + +This set of scripts is part of [a recent study on OASA](https://insidestory.gr/article/noymera-leoforeia-athinas) from Diomidis Spinellis and Eleftheria Tsaliki. OASA is the mass-transport system supporting the city of Athens. + +1. `1.sh`: Vehicles on the road per day +2. `2.sh`: Days a vehicle is on the road +3. `3.sh`: Hours each vehicle is on the road +4. `4.sh`: Hours monitored each day +5. 
`5.sh`: Hours each bus is active each day + diff --git a/evaluation/distr_benchmarks/analytics-mts/input/.gitignore b/evaluation/distr_benchmarks/analytics-mts/input/.gitignore new file mode 100644 index 000000000..f264282d2 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/input/.gitignore @@ -0,0 +1,5 @@ +./oasa-2021-01-08.bz2 +in*.csv +./out +./out1 +*.out diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh new file mode 100755 index 000000000..f010ef168 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# #Check that we are in the appropriate directory where setup.sh is +# #https://stackoverflow.com/a/246128 +# DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +# echo "changing to $DIR to run setup.sh" +# cd $DIR + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +if [[ "$1" == "-c" ]]; then + rm -f *.bz2 'in.csv' 'in_small.csv' + exit +fi + +setup_dataset() { + hdfs dfs -mkdir /analytics-mts + if [ ! -f ./in.csv ] && [ "$1" = "--full" ]; then + # yesterday=$(date --date='1 days ago' +'%y-%m-%d') + # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | + curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv + if [ $? -ne 0 ]; then + echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors" + exit 1 + fi + hdfs dfs -put in.csv /analytics-mts/in.csv + elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then + if [ ! -f ./in_small.csv ]; then + echo "Generating small-size inputs" + # FIXME PR: Do we need all of them? 
+ curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv + fi + hdfs dfs -put in_small.csv /analytics-mts/in_small.csv + fi +} + +source_var() { + if [[ "$1" == "--small" ]]; then + export IN="input/in_small.csv" + else + export IN="input/in.csv" + fi +} diff --git a/evaluation/distr_benchmarks/analytics-mts/run-experiment.sh b/evaluation/distr_benchmarks/analytics-mts/run-experiment.sh new file mode 100755 index 000000000..ca0a0c010 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/run-experiment.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} + +eval_dir="$PASH_TOP/evaluation/buses/" +results_dir="${eval_dir}/results/" + +mkdir -p $results_dir + +for i in 1 2 3 4 +do + script="${eval_dir}/${i}.sh" + echo "Executing $script..." + + seq_output=/tmp/seq_output + pash_width_16_no_cat_split_output=/tmp/pash_16_no_cat_split_output + pash_width_16_output=/tmp/pash_16_output + + seq_time="${results_dir}/${i}_2_seq.time" + pash_width_16_no_cat_split_time="${results_dir}/${i}_16_distr_auto_split_fan_in_fan_out.time" + pash_width_16_time="${results_dir}/${i}_16_distr_auto_split.time" + + echo "Executing the script with bash..." + { time /bin/bash $script > $seq_output ; } 2> >(tee "${seq_time}" >&2) + + echo "Executing the script with pash -w 16 without the cat-split optimization (log in: /tmp/pash_16_log)" + { time $PASH_TOP/pa.sh -w 16 -d 1 --log_file /tmp/pash_16_no_cat_split_log --no_cat_split_vanish --output_time $script ; } 1> "$pash_width_16_no_cat_split_output" 2> >(tee "${pash_width_16_no_cat_split_time}" >&2) + echo "Checking for output equivalence..." 
+ diff -s $seq_output $pash_width_16_no_cat_split_output | head + + echo "Executing the script with pash -w 16 (log in: /tmp/pash_16_log)" + { time $PASH_TOP/pa.sh -w 16 -d 1 --log_file /tmp/pash_16_log --output_time $script ; } 1> "$pash_width_16_output" 2> >(tee "${pash_width_16_time}" >&2) + echo "Checking for output equivalence..." + diff -s $seq_output $pash_width_16_output | head + +done diff --git a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh new file mode 100755 index 000000000..4823a1f61 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh @@ -0,0 +1,75 @@ +PASH_FLAGS='--width 8 --r_split' +export TIMEFORMAT=%R + +if [[ "$1" == "--small" ]]; then + export IN="input/in_small.csv" +else + export IN="input/in.csv" +fi + +analytics-mts_bash(){ + times_file="seq.res" + outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing MTS analytics $(date) | tee "$times_file" + echo '' >> "$times_file" + ## FIXME 5.sh is not working yet + for number in `seq 4` + do + script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + # select the respective input + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done +} + +analytics-mts_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing MTS analytics with pash $(date) | tee "$times_file" + echo '' >> "$times_file" + ## FIXME 5.sh is not working yet + for number in `seq 4` + do + 
script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done +} + +analytics-mts_bash + +analytics-mts_pash "$PASH_FLAGS" "par" + +analytics-mts_pash "$PASH_FLAGS --distributed_exec" "distr" From 170e7cd1ea54f25628d33014a1f8ef5cb9573250 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Fri, 3 Jun 2022 00:02:18 -0600 Subject: [PATCH 20/37] added gitingore --- evaluation/distr_benchmarks/.gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/.gitignore b/evaluation/distr_benchmarks/.gitignore index e6d35e74c..e9dd79c07 100644 --- a/evaluation/distr_benchmarks/.gitignore +++ b/evaluation/distr_benchmarks/.gitignore @@ -1 +1,3 @@ -outputs \ No newline at end of file +outputs +*.res.* +*.txt \ No newline at end of file From 017ef5598fcd2768c9acb5c214ba6d5e6cd26d77 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Fri, 3 Jun 2022 02:38:58 -0400 Subject: [PATCH 21/37] fix trigrams nlp --- evaluation/distr_benchmarks/nlp/6_1.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 5b4181fb2..1adaca799 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -7,7 +7,7 @@ OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" -trigrams() { +pure_func() { input=$1 TEMPDIR=$(mktemp -d) tr -sc '[A-Z][a-z]' '[\012*]' > ${TEMPDIR}/${input}.words @@ -16,12 +16,12 @@ trigrams() { 
paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c rm -rf ${TEMPDIR} } -export -f trigrams +export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN"/"$input | grep 'the land of' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 - hdfs dfs -cat $IN"/"$input | grep 'And he said' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 + hdfs dfs -cat $IN"/"$input | grep 'the land of' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 + hdfs dfs -cat $IN"/"$input | grep 'And he said' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 done echo 'done'; From 039334ba700c8080414fd7f9edf4c7f948b965ac Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 6 Jun 2022 16:08:06 -0400 Subject: [PATCH 22/37] some fixes --- evaluation/distr_benchmarks/analytics-mts/1.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/2.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/3.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/4.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/input/setup.sh | 6 +++--- evaluation/distr_benchmarks/analytics-mts/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/nlp/6_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/unix50/input/setup.sh | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/evaluation/distr_benchmarks/analytics-mts/1.sh b/evaluation/distr_benchmarks/analytics-mts/1.sh index 746087898..1ce28dc4a 100755 --- a/evaluation/distr_benchmarks/analytics-mts/1.sh +++ b/evaluation/distr_benchmarks/analytics-mts/1.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 
's/T..:..:..//' | # hide times cut -d ',' -f 1,3 | # keep only day and bus no sort -u | # remove duplicate records due to time diff --git a/evaluation/distr_benchmarks/analytics-mts/2.sh b/evaluation/distr_benchmarks/analytics-mts/2.sh index 9de4272f8..59abd8de4 100755 --- a/evaluation/distr_benchmarks/analytics-mts/2.sh +++ b/evaluation/distr_benchmarks/analytics-mts/2.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 's/T..:..:..//' | # hide times cut -d ',' -f 3,1 | # keep only day and bus ID sort -u | # removing duplicate day-buses diff --git a/evaluation/distr_benchmarks/analytics-mts/3.sh b/evaluation/distr_benchmarks/analytics-mts/3.sh index d1bd67024..829442fc9 100755 --- a/evaluation/distr_benchmarks/analytics-mts/3.sh +++ b/evaluation/distr_benchmarks/analytics-mts/3.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 's/T\(..\):..:../,\1/' | # keep times only cut -d ',' -f 1,2,4 | # keep only time date and bus id sort -u | # removing duplicate entries diff --git a/evaluation/distr_benchmarks/analytics-mts/4.sh b/evaluation/distr_benchmarks/analytics-mts/4.sh index e77f8efdf..36c4010bc 100755 --- a/evaluation/distr_benchmarks/analytics-mts/4.sh +++ b/evaluation/distr_benchmarks/analytics-mts/4.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 
's/T\(..\):..:../,\1/' | # keep times only cut -d ',' -f 1,2 | # keep only time and date sort -u | # removing duplicate entries diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh index f010ef168..7dc7d067f 100755 --- a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh +++ b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh @@ -15,7 +15,7 @@ fi setup_dataset() { hdfs dfs -mkdir /analytics-mts - if [ ! -f ./in.csv ] && [ "$1" = "--full" ]; then + if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then # yesterday=$(date --date='1 days ago' +'%y-%m-%d') # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv @@ -36,8 +36,8 @@ setup_dataset() { source_var() { if [[ "$1" == "--small" ]]; then - export IN="input/in_small.csv" + export IN="analytics-mts/in_small.csv" else - export IN="input/in.csv" + export IN="analytics-mts/in.csv" fi } diff --git a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh index 4823a1f61..9426fcbd9 100755 --- a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh +++ b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh @@ -2,9 +2,9 @@ PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R if [[ "$1" == "--small" ]]; then - export IN="input/in_small.csv" + export IN="/analytics-mts/in_small.csv" else - export IN="input/in.csv" + export IN="/analytics-mts/in.csv" fi analytics-mts_bash(){ diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 1adaca799..d0cea8ad9 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -20,8 +20,8 @@ export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN"/"$input | grep 'the land of' | pure_func ${input} | sort 
-nr | sed 5q > ${OUT}/${input}.out0 - hdfs dfs -cat $IN"/"$input | grep 'And he said' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 + hdfs dfs -cat $IN/$input | grep 'the land of' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | grep 'And he said' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index b693d5065..c1285f73a 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -14,10 +14,10 @@ names_scripts=( "2syllable_words;6_5" "4letter_words;6_2" "bigrams_appear_twice;8.2_2" - # "bigrams;4_3" + "bigrams;4_3" "compare_exodus_genesis;8.3_3" "count_consonant_seq;7_2" - # "count_morphs;7_1" + "count_morphs;7_1" "count_trigrams;4_3b" "count_vowel_seq;2_2" "count_words;1_1" diff --git a/evaluation/distr_benchmarks/unix50/input/setup.sh b/evaluation/distr_benchmarks/unix50/input/setup.sh index 01d7aaa21..68b831d82 100755 --- a/evaluation/distr_benchmarks/unix50/input/setup.sh +++ b/evaluation/distr_benchmarks/unix50/input/setup.sh @@ -42,7 +42,7 @@ setup_dataset() { wget "http://ndr.md/data/unix50/${input}.txt" "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" fi - hdfs dfs -put $file /unix50/$file + hdfs dfs -put "${input}.txt" /unix50/"${input}.txt" done # increase the original input size 10x From 99c3dbb85a811f2253ae5be3891770c5c30cd43e Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 6 Jun 2022 17:24:33 -0400 Subject: [PATCH 23/37] port some dependency untagling scripts to hdfs --- .../dependency_untangling/compress_files.sh | 20 +- .../dependency_untangling/encrypt_files.sh | 20 +- .../dependency_untangling/img_convert.sh | 12 +- .../input/install-deps.sh | 45 +-- .../dependency_untangling/input/setup.sh | 261 +++++++++--------- .../dependency_untangling/nginx.sh | 34 +-- 
.../dependency_untangling/pcap.sh | 29 +- .../dependency_untangling/run.distr.sh | 84 +++--- .../dependency_untangling/to_mp3.sh | 10 +- 9 files changed, 263 insertions(+), 252 deletions(-) diff --git a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh index 652ce1969..d7c331e84 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh @@ -1,21 +1,19 @@ #!/bin/bash # compress all files in a directory -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data/} +IN=${IN:-/dependency_untangling/pcap_data/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/compress} -LOGS=${OUT}/logs -mkdir -p ${OUT}/logs -run_tests() { - name=$(basename $1).zip - zip -r ${OUT}/$name $1 + +mkdir -p ${OUT} +pure_func() { + zip -r -- } -export -f run_tests +export -f pure_func -pkg_count=0 -for item in ${IN}/*; +for item in $(hdfs dfs -ls -C ${IN}); do - pkg_count=$((pkg_count + 1)); - run_tests $item > "${LOGS}"/"$pkg_count.log" + output_name=$(basename $item).zip + hdfs dfs -cat $item | pure_func > $OUT/$output_name done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh index 421732513..dfec87ea9 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh @@ -1,20 +1,18 @@ #!/bin/bash # encrypt all files in a directory -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +IN=${IN:-/dependency_untangling/pcap_data} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/encrypt} -LOGS=${OUT}/logs -mkdir -p ${LOGS} -run_tests() { - openssl enc -aes-256-cbc -pbkdf2 -iter 20000 -in $1 
-out $OUT/$(basename $1).enc -k 'key' -} +mkdir -p ${OUT} -export -f run_tests -pkg_count=0 +pure_func() { + openssl enc -aes-256-cbc -pbkdf2 -iter 20000 -k 'key' +} +export -f pure_func -for item in ${IN}/*; +for item in $(hdfs dfs -ls -C ${IN}); do - pkg_count=$((pkg_count + 1)); - run_tests $item > ${LOGS}/${pkg_count}.log + output_name=$(basename $item).enc + hdfs dfs -cat $item | pure_func > $OUT/$output_name done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh index 2b87d0528..d38e474f2 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh @@ -1,12 +1,18 @@ #!/bin/bash # tag: resize image -IN=${JPG:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/jpg} +IN=${JPG:-/dependency_untangling/jpg} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/jpg} mkdir -p ${OUT} -for i in $IN/*.jpg; + +pure_func () { + convert -resize 70% "-" "-" +} +export -f pure_func + +for i in $(hdfs dfs -ls -C ${IN}/*.jpg); do out=$OUT/$(basename -- $i) - convert -resize 70% "$i" "$out"; + hdfs dfs -cat $i | pure_func > $out; done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh index 4cb9e845a..3d4a75b1a 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh @@ -8,27 +8,28 @@ if ! dpkg -s $pkgs >/dev/null 2>&1 ; then echo 'Packages Installed' fi -if [ ! 
-d ${IN}/deps/samtools-1.7 ]; then - cd ${IN}/deps/ - wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip - unzip 1.7.zip - rm 1.7.zip - cd samtools-1.7 - wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip - unzip 1.7.zip - autoheader # Build config.h.in (this may generate a warning about - # AC_CONFIG_SUBDIRS - please ignore it). - autoconf -Wno-syntax # Generate the configure script - ./configure # Needed for choosing optional functionality - make - rm -rf 1.7.zip - echo 'Samtools installed' -fi +# NOT used +# if [ ! -d ${IN}/deps/samtools-1.7 ]; then +# cd ${IN}/deps/ +# wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip +# unzip 1.7.zip +# rm 1.7.zip +# cd samtools-1.7 +# wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip +# unzip 1.7.zip +# autoheader # Build config.h.in (this may generate a warning about +# # AC_CONFIG_SUBDIRS - please ignore it). +# autoconf -Wno-syntax # Generate the configure script +# ./configure # Needed for choosing optional functionality +# make +# rm -rf 1.7.zip +# echo 'Samtools installed' +# fi -if [ ! -f ${IN}/deps/makedeb.deb ]; then - cd ${IN}/deps/ - wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb - sudo dpkg -i makedeb.deb - echo 'Makedeb installed' -fi +# if [ ! 
-f ${IN}/deps/makedeb.deb ]; then +# cd ${IN}/deps/ +# wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb +# sudo dpkg -i makedeb.deb +# echo 'Makedeb installed' +# fi diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh index 58ee4bd7d..d3baf70ca 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh @@ -25,136 +25,141 @@ if [ "$1" == "-c" ]; then fi setup_dataset() { - if [ "$1" == "--small" ]; then - LOG_DATA_FILES=6 - WAV_DATA_FILES=20 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip - JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip - PCAP_DATA_FILES=1 - else - LOG_DATA_FILES=84 - WAV_DATA_FILES=120 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip - JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip - PCAP_DATA_FILES=15 - fi - - if [ ! -d ${IN}/wav ]; then - wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip - unzip wav.zip && cd wav/ - for f in *.wav; do - FILE=$(basename "$f") - for (( i = 0; i <= $WAV_DATA_FILES; i++)) do - echo copying to $f$i.wav - cp $f $f$i.wav - done - done - echo "WAV Generated" - fi - - if [ ! -d ${IN}/jpg ]; then - cd ${IN} - wget $JPG_DATA_LINK - unzip jpg.zip - echo "JPG Generated" - rm -rf ${IN}/jpg.zip - fi - - # download the input for the nginx logs and populate the dataset - if [ ! 
-d ${IN}/log_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip - unzip nginx.zip - rm nginx.zip - # generating analysis logs - mkdir -p ${IN}/log_data - for (( i = 1; i <=$LOG_DATA_FILES; i++)) do - for j in nginx-logs/*;do - n=$(basename $j) - cat $j > log_data/log${i}_${n}.log; - done - done - echo "Logs Generated" - fi - - if [ ! -d ${IN}/bio ]; then - if [ "$1" = "--small" ]; then - # download the Genome loc file - wget $BIO_DATA_LINK - unzip bio.zip - cd bio - wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt - cd .. - rm bio.zip - else - mkdir ${IN}/bio - cd ${IN}/bio - # download the file containing the links for the dataset - wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt - # download the Genome loc file - wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - # start downloading the real dataset - cat ${IN_NAME} |while read s_line; - do - echo ${IN_NAME} - sample=$(echo $s_line |cut -d " " -f 2); - if [[ ! -f $sample ]]; then - pop=$(echo $s_line |cut -f 1 -d " "); - link=$(echo $s_line |cut -f 3 -d " "); - wget -O "$sample".bam "$link"; ##this part can be adjusted maybe - fi - done; - fi - echo "Genome data downloaded" - fi - - # download the initial pcaps to populate the whole dataset - if [ ! -d ${IN}/pcap_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip - unzip pcaps.zip - rm pcaps.zip - mkdir ${IN}/pcap_data/ - # generates 20G - for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do - for j in ${IN}/pcaps/*;do - n=$(basename $j) - cat $j > pcap_data/pcap${i}_${n}; - done - done - echo "Pcaps Generated" - fi - - # download the modules for the Mir static analyses - if [ ! 
-d ${IN}/node_modules ]; then - cd $IN - wget $NODE_MODULE_LINK - unzip node_modules.zip - rm node_modules.zip - # download the specific mir version - wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip - unzip mir-sa.zip - rm mir-sa.zip - echo "Node modules generated" - fi - - # download the packages for the package building - if [ ! -f ${IN}/packages ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/packages - if [ "$1" = "--small" ]; then - head -n 20 packages > p - mv p packages - fi - echo "Package datset downloaded" - fi + hdfs dfs -mkdir /dependency_untangling + + if [ "$1" == "--small" ]; then + LOG_DATA_FILES=6 + WAV_DATA_FILES=20 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip + PCAP_DATA_FILES=1 + else + LOG_DATA_FILES=84 + WAV_DATA_FILES=120 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip + PCAP_DATA_FILES=15 + fi + + if [ ! -d ${IN}/wav ]; then + wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip + unzip wav.zip && cd wav/ + for f in *.wav; do + FILE=$(basename "$f") + for (( i = 0; i <= $WAV_DATA_FILES; i++)) do + echo copying to $f$i.wav + cp $f $f$i.wav + done + done + cd .. + hdfs dfs -put wav /dependency_untangling/wav + echo "WAV Generated" + fi + + if [ ! -d ${IN}/jpg ]; then + cd ${IN} + wget $JPG_DATA_LINK + unzip jpg.zip + hdfs dfs -put jpg /dependency_untangling/jpg + echo "JPG Generated" + rm -rf ${IN}/jpg.zip + fi + + # download the input for the nginx logs and populate the dataset + if [ ! 
-d ${IN}/log_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip + unzip nginx.zip + rm nginx.zip + # generating analysis logs + mkdir -p ${IN}/log_data + for (( i = 1; i <=$LOG_DATA_FILES; i++)) do + for j in nginx-logs/*;do + n=$(basename $j) + cat $j > log_data/log${i}_${n}.log; + done + done + hdfs dfs -put log_data /dependency_untangling/log_data + echo "Logs Generated" + fi + + # if [ ! -d ${IN}/bio ]; then + # if [ "$1" = "--small" ]; then + # # download the Genome loc file + # wget $BIO_DATA_LINK + # unzip bio.zip + # cd bio + # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt + # cd .. + # rm bio.zip + # else + # mkdir ${IN}/bio + # cd ${IN}/bio + # # download the file containing the links for the dataset + # wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt + # # download the Genome loc file + # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # # start downloading the real dataset + # cat ${IN_NAME} |while read s_line; + # do + # echo ${IN_NAME} + # sample=$(echo $s_line |cut -d " " -f 2); + # if [[ ! -f $sample ]]; then + # pop=$(echo $s_line |cut -f 1 -d " "); + # link=$(echo $s_line |cut -f 3 -d " "); + # wget -O "$sample".bam "$link"; ##this part can be adjusted maybe + # fi + # done; + # fi + # echo "Genome data downloaded" + # fi + + # download the initial pcaps to populate the whole dataset + if [ ! -d ${IN}/pcap_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip + unzip pcaps.zip + rm pcaps.zip + mkdir ${IN}/pcap_data/ + # generates 20G + for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do + for j in ${IN}/pcaps/*;do + n=$(basename $j) + cat $j > pcap_data/pcap${i}_${n}; + done + done + hdfs dfs -put pcap_data /dependency_untangling/pcap_data + echo "Pcaps Generated" + fi + + # # download the modules for the Mir static analyses + # if [ ! 
-d ${IN}/node_modules ]; then + # cd $IN + # wget $NODE_MODULE_LINK + # unzip node_modules.zip + # rm node_modules.zip + # # download the specific mir version + # wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip + # unzip mir-sa.zip + # rm mir-sa.zip + # echo "Node modules generated" + # fi + + # # download the packages for the package building + # if [ ! -f ${IN}/packages ]; then + # cd $IN + # wget http://pac-n4.csail.mit.edu:81/pash_data/packages + # if [ "$1" = "--small" ]; then + # head -n 20 packages > p + # mv p packages + # fi + # echo "Package datset downloaded" + # fi } source_var() { export IN= } - -setup_dataset diff --git a/evaluation/distr_benchmarks/dependency_untangling/nginx.sh b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh index afd53af8e..680de995e 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/nginx.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh @@ -1,36 +1,38 @@ #!/bin/bash # tag: nginx logs -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/log_data} +IN=${IN:-/dependency_untangling/log_data} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/nginx-logs} mkdir -p ${OUT} -run_tests() { - # i don't think we should assign things to $0, however, it works with both - IN=$1 - cat $IN | cut -d "\"" -f3 | cut -d ' ' -f2 | sort | uniq -c | sort -rn +pure_func() { + tempfile=$(mktemp) + + tee $tempfile | cut -d "\"" -f3 | cut -d ' ' -f2 | sort | uniq -c | sort -rn # awk alternative, too slow - awk '{print $9}' $IN | sort | uniq -c | sort -rn + awk '{print $9}' $tempfile | sort | uniq -c | sort -rn # find broken links broken links - awk '($9 ~ /404/)' $IN | awk '{print $7}' | sort | uniq -c | sort -rn + awk '($9 ~ /404/)' $tempfile | awk '{print $7}' | sort | uniq -c | sort -rn # for 502 (bad-gateway) we can run following command: - awk '($9 ~ /502/)' $IN | awk '{print $7}' | sort | uniq -c | sort -r + awk '($9 ~ /502/)' $tempfile | awk 
'{print $7}' | sort | uniq -c | sort -r # Who are requesting broken links (or URLs resulting in 502) - awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' $IN | awk '{print $1}' | sort | uniq -c | sort -r + awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' $tempfile | awk '{print $1}' | sort | uniq -c | sort -r # 404 for php files -mostly hacking attempts - awk '($9 ~ /404/)' $IN | awk -F\" '($2 ~ "^GET .*.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20 + awk '($9 ~ /404/)' $tempfile | awk -F\" '($2 ~ "^GET .*.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20 ############################## # Most requested URLs ######## - awk -F\" '{print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r + awk -F\" '{print $2}' $tempfile | awk '{print $2}' | sort | uniq -c | sort -r # Most requested URLs containing XYZ - awk -F\" '($2 ~ "ref"){print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r + awk -F\" '($2 ~ "ref"){print $2}' $tempfile | awk '{print $2}' | sort | uniq -c | sort -r + + rm $tempfile } +export -f pure_func -export -f run_tests -for f in ${IN}/*; do +for log in $(hdfs dfs -ls -C ${IN}); do #bash -c 'run_tests $0 $1' $f $f #> /dev/null #run_tests $f > /dev/null - logname=$OUT/$(basename $f) - run_tests $f > $logname + logname=$OUT/$(basename $log) + hdfs dfs -cat $log | pure_func > $logname done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh index d4e1b70ea..13a0cd29e 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh @@ -1,25 +1,26 @@ #!/bin/bash #tag: pcap analysis -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +IN=${IN:-/dependency_untangling/pcap_data} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/pcap-analysis} -LOGS=${OUT}/logs -mkdir -p ${LOGS} 
-run_tests() { - INPUT=$1 - /usr/sbin/tcpdump -nn -r ${INPUT} -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null +mkdir -p $OUT + +pure_func() { + tempfile=$(mktemp) + + tee $tempfile | tcpdump -nn -r '-' -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null # extract URL - /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null + tcpdump -nn -r $tempfile -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null # extract passwords - /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -A -n -l 2> /dev/null | egrep -i "POST /|pwd=|passwd=|password=|Host:" 2> /dev/null -} -export -f run_tests + tcpdump -nn -r $tempfile -s 0 -A -n -l 2> /dev/null | egrep -i "POST /|pwd=|passwd=|password=|Host:" 2> /dev/null -pkg_count=0 + rm -f $tempfile +} +export -f pure_func -for item in ${IN}/*; +for item in $(hdfs dfs -ls -C ${IN}); do - pkg_count=$((pkg_count + 1)); - run_tests $item > ${LOGS}/${pkg_count}.log + logname=$OUT/$(basename $item).log; + hdfs dfs -cat $item | pure_func > $logname done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh index 8928ed6be..6cdaa9c84 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh @@ -1,50 +1,47 @@ -PASH_FLAGS='--width 6 --r_split' +PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R -export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" names_scripts=( "MediaConv1;img_convert" "MediaConv2;to_mp3" - "Program_Inference;proginf" + # "Program_Inference;proginf" "LogAnalysis1;nginx" "LogAnalysis2;pcap" # "Genomics_Computation;genomics" - "AurPkg;pacaur" + # "AurPkg;pacaur" "FileEnc1;compress_files" "FileEnc2;encrypt_files" ) -oneliners_bash() { - seq_times_file="seq.res" - seq_outputs_suffix="seq.out" 
- outputs_dir="outputs" - - mkdir -p "$outputs_dir" +dependency_untangling_bash() { + outputs_dir="outputs" + times_file="seq.res" + outputs_suffix="seq.out" - touch "$seq_times_file" - cat $seq_times_file > $seq_times_file.d - echo executing one-liners $(date) | tee -a "$seq_times_file" - echo '' > "$seq_times_file" + rm -rf input/output + mkdir -p "$outputs_dir" - for name_script in ${names_scripts[@]} - do + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing dependency_untangling $(date) | tee "$times_file" + echo '' >> "$times_file" + + export IN= + for name_script in ${names_scripts[@]} + do IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" script="${name_script_parsed[1]}" - export IN= - export OUT= - printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${name}.sh:${pad}" padded_script=${padded_script:0:30} - - seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" - - echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) | tee -a "$seq_times_file" - done + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done } -oneliners_pash(){ + +dependency_untangling_pash() { flags=${1:-$PASH_FLAGS} prefix=${2:-par} @@ -54,37 +51,42 @@ oneliners_pash(){ outputs_dir="outputs" pash_logs_dir="pash_logs_$prefix" + rm -rf input/output/ + mkdir -p "$outputs_dir" mkdir -p "$pash_logs_dir" touch "$times_file" - cat $times_file > $times_file.d - echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" - echo '' > "$times_file" - + cat "$times_file" > "$times_file".d + echo executing dependency_untangling with pash $(date) | tee "$times_file" + echo '' >> "$times_file" + + export IN= for name_script in ${names_scripts[@]} - do + do IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" 
script="${name_script_parsed[1]}" - - export IN= - export OUT= - printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${name}.sh:${pad}" padded_script=${padded_script:0:30} - outputs_file="${outputs_dir}/${script}.${outputs_suffix}" pash_log="${pash_logs_dir}/${script}.pash.log" single_time_file="${outputs_dir}/${script}.${time_suffix}" - + echo -n "${padded_script}" | tee -a "$times_file" { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done } -# oneliners_bash -oneliners_pash "$PASH_FLAGS" "par" -# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" +dependency_untangling_bash + +dependency_untangling_pash "$PASH_FLAGS" "par_no_du" + +dependency_untangling_pash "$PASH_FLAGS --parallel_pipelines --parallel_pipelines_limit 24" "par" + +dependency_untangling_pash "$PASH_FLAGS --distributed_exec" "distr_no_du" + +dependency_untangling_pash "$PASH_FLAGS --parallel_pipelines --distributed_exec --parallel_pipelines_limit 24" "distr" + diff --git a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh index c94a75b49..3b0187d14 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh @@ -1,17 +1,15 @@ #!/bin/bash # tag: wav-to-mp3 -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/wav} +IN=${IN:-/dependency_untangling/wav} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mp3} -LOGS=${OUT}/logs -mkdir -p ${LOGS} +mkdir -p ${OUT} + pure_func(){ ffmpeg -y -i pipe:0 -f mp3 -ab 192000 pipe:1 2>/dev/null } - export -f pure_func -pkg_count=0 -for item in $(hdfs dfs -ls -C /for-loops/wav); +for item in $(hdfs dfs -ls -C $IN); do pkg_count=$((pkg_count + 1)); out="$OUT/$(basename $item).mp3" From 6f0f5f303a4959de6a506626f792c7caf77702d3 
Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 6 Jun 2022 17:28:41 -0400 Subject: [PATCH 24/37] small changes to eval scripts --- evaluation/distr_benchmarks/analytics-mts/run.distr.sh | 4 ++-- .../distr_benchmarks/dependency_untangling/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/nlp/run.distr.sh | 8 ++++---- evaluation/distr_benchmarks/oneliners/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/unix50/run.distr.sh | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh index 9426fcbd9..23c66af1d 100755 --- a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh +++ b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh @@ -15,7 +15,7 @@ analytics-mts_bash(){ mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing MTS analytics $(date) | tee "$times_file" echo '' >> "$times_file" ## FIXME 5.sh is not working yet @@ -47,7 +47,7 @@ analytics-mts_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing MTS analytics with pash $(date) | tee "$times_file" echo '' >> "$times_file" ## FIXME 5.sh is not working yet diff --git a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh index 6cdaa9c84..29ab821cc 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh @@ -22,7 +22,7 @@ dependency_untangling_bash() { mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing dependency_untangling $(date) | tee "$times_file" echo '' >> "$times_file" @@ -57,7 +57,7 @@ dependency_untangling_pash() { mkdir -p "$pash_logs_dir" touch 
"$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing dependency_untangling with pash $(date) | tee "$times_file" echo '' >> "$times_file" diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index c1285f73a..a77c00346 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -35,7 +35,7 @@ names_scripts=( "words_no_vowels;6_3" ) -bash_nlp(){ +nlp_bash(){ outputs_dir="outputs" times_file="seq.res" outputs_suffix="seq.out" @@ -43,7 +43,7 @@ bash_nlp(){ mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix-for-nlp $(date) | tee "$times_file" echo '' >> "$times_file" @@ -76,7 +76,7 @@ nlp_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix-for-nlp with $prefix pash $(date) | tee "$times_file" echo '' >> "$times_file" @@ -99,7 +99,7 @@ nlp_pash(){ done } -bash_nlp +nlp_bash nlp_pash "$PASH_FLAGS" "par_no_du" diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 6eeaf36ac..5305426f6 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -26,7 +26,7 @@ oneliners_bash() { mkdir -p "$outputs_dir" touch "$seq_times_file" - cat $seq_times_file > $seq_times_file.d + cat $seq_times_file >> $seq_times_file.d echo executing one-liners $(date) | tee "$seq_times_file" echo '' >> "$seq_times_file" @@ -65,7 +65,7 @@ oneliners_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat $times_file > $times_file.d + cat $times_file >> $times_file.d echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file" echo '' >> "$times_file" diff --git 
a/evaluation/distr_benchmarks/unix50/run.distr.sh b/evaluation/distr_benchmarks/unix50/run.distr.sh index 2526bbbe4..c4dd9149d 100755 --- a/evaluation/distr_benchmarks/unix50/run.distr.sh +++ b/evaluation/distr_benchmarks/unix50/run.distr.sh @@ -16,7 +16,7 @@ unix50_bash(){ mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" @@ -49,7 +49,7 @@ unix50_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" From 9d824998f8d03c6541084589d6275532166b7e90 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 7 Jun 2022 13:20:21 +0000 Subject: [PATCH 25/37] improve oneliners eval and setup scripts --- .../analytics-mts/input/setup.sh | 44 +++++++------------ evaluation/distr_benchmarks/oneliners/diff.sh | 2 +- .../distr_benchmarks/oneliners/input/setup.sh | 32 +++++--------- .../distr_benchmarks/oneliners/nfa-regex.sh | 2 +- .../distr_benchmarks/oneliners/run.distr.sh | 23 ++++------ .../distr_benchmarks/oneliners/set-diff.sh | 2 +- .../oneliners/shortest-scripts.sh | 2 +- .../distr_benchmarks/oneliners/sort-sort.sh | 2 +- evaluation/distr_benchmarks/oneliners/sort.sh | 2 +- .../distr_benchmarks/oneliners/spell.sh | 2 +- .../distr_benchmarks/oneliners/top-n.sh | 2 +- 11 files changed, 44 insertions(+), 71 deletions(-) diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh index 7dc7d067f..df6ce23ca 100755 --- a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh +++ b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh @@ -13,31 +13,21 @@ if [[ "$1" == "-c" ]]; then exit fi -setup_dataset() { - hdfs dfs -mkdir /analytics-mts - if [ ! 
-f ./in.csv ] && [ "$1" != "--small" ];; then - # yesterday=$(date --date='1 days ago' +'%y-%m-%d') - # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | - curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv - if [ $? -ne 0 ]; then - echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors" - exit 1 - fi - hdfs dfs -put in.csv /analytics-mts/in.csv - elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then - if [ ! -f ./in_small.csv ]; then - echo "Generating small-size inputs" - # FIXME PR: Do we need all of them? - curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv - fi - hdfs dfs -put in_small.csv /analytics-mts/in_small.csv +hdfs dfs -mkdir /analytics-mts +if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then + # yesterday=$(date --date='1 days ago' +'%y-%m-%d') + # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | + curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv + if [ $? -ne 0 ]; then + echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors" + exit 1 fi -} - -source_var() { - if [[ "$1" == "--small" ]]; then - export IN="analytics-mts/in_small.csv" - else - export IN="analytics-mts/in.csv" - fi -} + hdfs dfs -put in.csv /analytics-mts/in.csv +elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then + if [ ! -f ./in_small.csv ]; then + echo "Generating small-size inputs" + # FIXME PR: Do we need all of them? 
+ curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv + fi + hdfs dfs -put in_small.csv /analytics-mts/in_small.csv +fi diff --git a/evaluation/distr_benchmarks/oneliners/diff.sh b/evaluation/distr_benchmarks/oneliners/diff.sh index 9435ad1d7..5b771e394 100755 --- a/evaluation/distr_benchmarks/oneliners/diff.sh +++ b/evaluation/distr_benchmarks/oneliners/diff.sh @@ -3,7 +3,7 @@ # Taken from https://crashingdaily.wordpress.com/2008/03/06/diff-two-stdout-streams/ # shuf() { awk 'BEGIN {srand(); OFMT="%.17f"} {print rand(), $0}' "$@" | sort -k1,1n | cut -d ' ' -f2-; } -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} mkfifo s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index 3d4921c22..a24725912 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -13,6 +13,8 @@ if [[ "$1" == "-c" ]]; then exit fi +hdfs dfs -mkdir /oneliners + if [ ! -f ./1M.txt ]; then curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? -ne 0 ]; then @@ -67,30 +69,16 @@ if [ ! -f ./all_cmdsx100.txt ]; then done fi - -if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then - echo "Generating full-size inputs" - - - if [ ! -f ./3G.txt ]; then - touch 3G.txt - for (( i = 0; i < 3; i++ )); do - cat 1G.txt >> 3G.txt - done - fi - input_files+=("3G.txt") - - if [ ! -f ./10G.txt ]; then - touch 10G.txt - for (( i = 0; i < 10; i++ )); do - cat 1G.txt >> 10G.txt - done - fi - input_files+=("10G.txt") +if [ ! 
-f ./3G.txt ]; then + touch 3G.txt + for (( i = 0; i < 3; i++ )); do + cat 1G.txt >> 3G.txt + done fi +input_files+=("3G.txt") # Add files with different replication factors for file in "${input_files[@]}"; do - hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file - hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file + hdfs dfs -put $file /oneliners/$file + rm -f $file done \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh index 2a2c30718..2594da3eb 100755 --- a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh +++ b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh @@ -1,6 +1,6 @@ #!/bin/bash # Match complex regular-expression over input -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | tr A-Z a-z | grep '\(.\).*\1\(.\).*\2\(.\).*\3\(.\).*\4' diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 5305426f6..95adff56b 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -19,9 +19,8 @@ scripts_inputs=( oneliners_bash() { outputs_dir="outputs" - rep=${1:-rep3} - seq_times_file=$rep"_seq.res" - seq_outputs_suffix=$rep"_seq.out" + seq_times_file="seq.res" + seq_outputs_suffix="seq.out" mkdir -p "$outputs_dir" @@ -36,7 +35,7 @@ oneliners_bash() { script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN=/$rep\_$input + export IN="/oneliners/$input" export dict= printf -v pad %30s @@ -52,8 +51,7 @@ oneliners_bash() { oneliners_pash(){ flags=${1:-$PASH_FLAGS} prefix=${2:-par} - rep=${3:-rep3} - prefix=$prefix\_$rep + prefix=$prefix times_file="$prefix.res" outputs_suffix="$prefix.out" @@ -66,7 +64,7 @@ oneliners_pash(){ touch "$times_file" cat $times_file >> $times_file.d - echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file" + echo executing one-liners with $prefix 
pash with data $(date) | tee "$times_file" echo '' >> "$times_file" for script_input in ${scripts_inputs[@]} @@ -75,7 +73,7 @@ oneliners_pash(){ script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN=/$rep\_$input + export IN="/oneliners/$input" export dict= printf -v pad %30s @@ -92,11 +90,8 @@ oneliners_pash(){ done } -# oneliners_bash "rep1" -oneliners_bash "rep3" +oneliners_bash -# oneliners_pash "$PASH_FLAGS" "par" "rep1" -oneliners_pash "$PASH_FLAGS" "par" "rep3" +oneliners_pash "$PASH_FLAGS" "par" -# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" -oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3" +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" diff --git a/evaluation/distr_benchmarks/oneliners/set-diff.sh b/evaluation/distr_benchmarks/oneliners/set-diff.sh index 039e6996f..715488315 100755 --- a/evaluation/distr_benchmarks/oneliners/set-diff.sh +++ b/evaluation/distr_benchmarks/oneliners/set-diff.sh @@ -2,7 +2,7 @@ # Show the set-difference between two streams (i.e., elements in the first that are not in the second). # https://stackoverflow.com/questions/2509533/bash-linux-set-difference-between-two-text-files -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} mkfifo s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh index 63a5bc3d9..b8999923b 100755 --- a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh +++ b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh @@ -6,6 +6,6 @@ # FIX: Input here should be a set of commands, more precisely, the ones on this specific machine. 
-IN=${IN:-/all_cmdsx100.txt} +IN=${IN:-/oneliners/all_cmdsx100.txt} hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 diff --git a/evaluation/distr_benchmarks/oneliners/sort-sort.sh b/evaluation/distr_benchmarks/oneliners/sort-sort.sh index 7b51ed889..16c372abc 100755 --- a/evaluation/distr_benchmarks/oneliners/sort-sort.sh +++ b/evaluation/distr_benchmarks/oneliners/sort-sort.sh @@ -1,6 +1,6 @@ #!/bin/bash # Calculate sort twice -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | tr A-Z a-z | sort | sort -r diff --git a/evaluation/distr_benchmarks/oneliners/sort.sh b/evaluation/distr_benchmarks/oneliners/sort.sh index 29cffa1cf..359701649 100755 --- a/evaluation/distr_benchmarks/oneliners/sort.sh +++ b/evaluation/distr_benchmarks/oneliners/sort.sh @@ -1,7 +1,7 @@ #!/bin/bash # Sort input -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | sort diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh index 7928babe4..c8b2ddaa9 100755 --- a/evaluation/distr_benchmarks/oneliners/spell.sh +++ b/evaluation/distr_benchmarks/oneliners/spell.sh @@ -1,7 +1,7 @@ #!/bin/bash # Calculate mispelled words in an input # https://dl.acm.org/doi/10.1145/3532.315102 -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} dict=${dict:-$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt} hdfs dfs -cat $IN | diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh index c2f7f2b21..ba2b4eb8d 100755 --- a/evaluation/distr_benchmarks/oneliners/top-n.sh +++ b/evaluation/distr_benchmarks/oneliners/top-n.sh @@ -2,7 +2,7 @@ # Top-N (1000) terms # from https://dl.acm.org/doi/10.1145/5948.315654 -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q From 
c6c2c387360bbf6910a87807ff44f72cfb84b445 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 7 Jun 2022 21:24:24 +0000 Subject: [PATCH 26/37] fix du installation scripts --- .../input/install-deps.sh | 48 ++-- .../dependency_untangling/input/setup.sh | 250 +++++++++--------- .../distr_benchmarks/install_all_deps.sh | 3 + 3 files changed, 152 insertions(+), 149 deletions(-) create mode 100755 evaluation/distr_benchmarks/install_all_deps.sh diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh index 3d4a75b1a..3bacbcaff 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh @@ -1,35 +1,41 @@ -IN=$PASH_TOP/evaluation/benchmarks/dependency_untangling/input/ +IN=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/ mkdir -p ${IN}/deps/ # install dependencies -pkgs='ffmpeg unrtf imagemagick libarchive-tools zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump' +pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump' if ! dpkg -s $pkgs >/dev/null 2>&1 ; then sudo apt-get install $pkgs -y echo 'Packages Installed' fi -# NOT used -# if [ ! -d ${IN}/deps/samtools-1.7 ]; then -# cd ${IN}/deps/ -# wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip -# unzip 1.7.zip -# rm 1.7.zip -# cd samtools-1.7 -# wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip -# unzip 1.7.zip -# autoheader # Build config.h.in (this may generate a warning about -# # AC_CONFIG_SUBDIRS - please ignore it). -# autoconf -Wno-syntax # Generate the configure script -# ./configure # Needed for choosing optional functionality -# make -# rm -rf 1.7.zip -# echo 'Samtools installed' -# fi +if [ ! 
-d ${IN}/deps/samtools-1.7 ]; then + cd ${IN}/deps/ + wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip + unzip 1.7.zip + rm 1.7.zip + cd samtools-1.7 + wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip + unzip 1.7.zip + autoheader # Build config.h.in (this may generate a warning about + # AC_CONFIG_SUBDIRS - please ignore it). + autoconf -Wno-syntax # Generate the configure script + ./configure # Needed for choosing optional functionality + make + rm -rf 1.7.zip + echo 'Samtools installed' +fi -# if [ ! -f ${IN}/deps/makedeb.deb ]; then +if [ ! -d ${IN}/mir-sa ]; then + # download the specific mir version + cd ${IN} + wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip + unzip mir-sa.zip + rm mir-sa.zip +fi + +# if ! dpkg -s "makedeb-makepkg" >/dev/null 2>&1 ; then # cd ${IN}/deps/ # wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb # sudo dpkg -i makedeb.deb # echo 'Makedeb installed' # fi - diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh index d3baf70ca..8c147e49c 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh @@ -24,142 +24,136 @@ if [ "$1" == "-c" ]; then exit fi -setup_dataset() { - hdfs dfs -mkdir /dependency_untangling +hdfs dfs -mkdir /dependency_untangling - if [ "$1" == "--small" ]; then - LOG_DATA_FILES=6 - WAV_DATA_FILES=20 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip - JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip - PCAP_DATA_FILES=1 - else - LOG_DATA_FILES=84 - WAV_DATA_FILES=120 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip - 
JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip - PCAP_DATA_FILES=15 - fi +if [ "$1" == "--small" ]; then + LOG_DATA_FILES=6 + WAV_DATA_FILES=20 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip + PCAP_DATA_FILES=1 +else + LOG_DATA_FILES=84 + WAV_DATA_FILES=120 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip + PCAP_DATA_FILES=15 +fi - if [ ! -d ${IN}/wav ]; then - wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip - unzip wav.zip && cd wav/ - for f in *.wav; do - FILE=$(basename "$f") - for (( i = 0; i <= $WAV_DATA_FILES; i++)) do - echo copying to $f$i.wav - cp $f $f$i.wav - done +if [ ! -d ${IN}/wav ]; then + wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip + unzip wav.zip && cd wav/ + for f in *.wav; do + FILE=$(basename "$f") + for (( i = 0; i <= $WAV_DATA_FILES; i++)) do + echo copying to $f$i.wav + cp $f $f$i.wav done - cd .. - hdfs dfs -put wav /dependency_untangling/wav - echo "WAV Generated" - fi + done + cd .. + hdfs dfs -put wav /dependency_untangling/wav + echo "WAV Generated" +fi - if [ ! -d ${IN}/jpg ]; then - cd ${IN} - wget $JPG_DATA_LINK - unzip jpg.zip - hdfs dfs -put jpg /dependency_untangling/jpg - echo "JPG Generated" - rm -rf ${IN}/jpg.zip - fi +if [ ! -d ${IN}/jpg ]; then + cd ${IN} + wget $JPG_DATA_LINK + unzip jpg.zip + hdfs dfs -put jpg /dependency_untangling/jpg + echo "JPG Generated" + rm -rf ${IN}/jpg.zip +fi - # download the input for the nginx logs and populate the dataset - if [ ! 
-d ${IN}/log_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip - unzip nginx.zip - rm nginx.zip - # generating analysis logs - mkdir -p ${IN}/log_data - for (( i = 1; i <=$LOG_DATA_FILES; i++)) do - for j in nginx-logs/*;do - n=$(basename $j) - cat $j > log_data/log${i}_${n}.log; - done +# download the input for the nginx logs and populate the dataset +if [ ! -d ${IN}/log_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip + unzip nginx.zip + rm nginx.zip + # generating analysis logs + mkdir -p ${IN}/log_data + for (( i = 1; i <=$LOG_DATA_FILES; i++)) do + for j in nginx-logs/*;do + n=$(basename $j) + cat $j > log_data/log${i}_${n}.log; done - hdfs dfs -put log_data /dependency_untangling/log_data - echo "Logs Generated" + done + hdfs dfs -put log_data /dependency_untangling/log_data + echo "Logs Generated" +fi + +if [ ! -d ${IN}/bio ]; then + if [ "$1" = "--small" ]; then + # download the Genome loc file + wget $BIO_DATA_LINK + unzip bio.zip + cd bio + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt + cd .. + rm bio.zip + else + mkdir ${IN}/bio + cd ${IN}/bio + # download the file containing the links for the dataset + wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt + # download the Genome loc file + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # start downloading the real dataset + IN_NAME=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/bio/100G.txt + cat ${IN_NAME} | while read s_line; + do + echo ${IN_NAME} + sample=$(echo $s_line |cut -d " " -f 2); + if [[ ! -f $sample ]]; then + pop=$(echo $s_line |cut -f 1 -d " "); + link=$(echo $s_line |cut -f 3 -d " "); + wget -O "$sample".bam "$link"; ##this part can be adjusted maybe + fi + done; + cd .. fi + hdfs dfs -put bio /dependency_untangling/bio + echo "Genome data downloaded" +fi - # if [ ! 
-d ${IN}/bio ]; then - # if [ "$1" = "--small" ]; then - # # download the Genome loc file - # wget $BIO_DATA_LINK - # unzip bio.zip - # cd bio - # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - # wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt - # cd .. - # rm bio.zip - # else - # mkdir ${IN}/bio - # cd ${IN}/bio - # # download the file containing the links for the dataset - # wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt - # # download the Genome loc file - # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - # # start downloading the real dataset - # cat ${IN_NAME} |while read s_line; - # do - # echo ${IN_NAME} - # sample=$(echo $s_line |cut -d " " -f 2); - # if [[ ! -f $sample ]]; then - # pop=$(echo $s_line |cut -f 1 -d " "); - # link=$(echo $s_line |cut -f 3 -d " "); - # wget -O "$sample".bam "$link"; ##this part can be adjusted maybe - # fi - # done; - # fi - # echo "Genome data downloaded" - # fi - - # download the initial pcaps to populate the whole dataset - if [ ! -d ${IN}/pcap_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip - unzip pcaps.zip - rm pcaps.zip - mkdir ${IN}/pcap_data/ - # generates 20G - for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do - for j in ${IN}/pcaps/*;do - n=$(basename $j) - cat $j > pcap_data/pcap${i}_${n}; - done +# download the initial pcaps to populate the whole dataset +if [ ! -d ${IN}/pcap_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip + unzip pcaps.zip + rm pcaps.zip + mkdir ${IN}/pcap_data/ + # generates 20G + for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do + for j in ${IN}/pcaps/*;do + n=$(basename $j) + cat $j > pcap_data/pcap${i}_${n}; done - hdfs dfs -put pcap_data /dependency_untangling/pcap_data - echo "Pcaps Generated" - fi - - # # download the modules for the Mir static analyses - # if [ ! 
-d ${IN}/node_modules ]; then - # cd $IN - # wget $NODE_MODULE_LINK - # unzip node_modules.zip - # rm node_modules.zip - # # download the specific mir version - # wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip - # unzip mir-sa.zip - # rm mir-sa.zip - # echo "Node modules generated" - # fi + done + hdfs dfs -put pcap_data /dependency_untangling/pcap_data + echo "Pcaps Generated" +fi - # # download the packages for the package building - # if [ ! -f ${IN}/packages ]; then - # cd $IN - # wget http://pac-n4.csail.mit.edu:81/pash_data/packages - # if [ "$1" = "--small" ]; then - # head -n 20 packages > p - # mv p packages - # fi - # echo "Package datset downloaded" - # fi -} +# download the modules for the Mir static analyses +if [ ! -d ${IN}/node_modules ]; then + cd $IN + wget $NODE_MODULE_LINK + unzip node_modules.zip + rm node_modules.zip + hdfs dfs -put node_modules /dependency_untangling/node_modules + echo "Node modules generated" +fi -source_var() { - export IN= -} +# # download the packages for the package building +# if [ ! 
-f ${IN}/packages ]; then +# cd $IN +# wget http://pac-n4.csail.mit.edu:81/pash_data/packages +# if [ "$1" = "--small" ]; then +# head -n 20 packages > p +# mv p packages +# fi +# echo "Package datset downloaded" +# fi diff --git a/evaluation/distr_benchmarks/install_all_deps.sh b/evaluation/distr_benchmarks/install_all_deps.sh new file mode 100755 index 000000000..3bf174252 --- /dev/null +++ b/evaluation/distr_benchmarks/install_all_deps.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +bash ./dependency_untangling/input/install-deps.sh \ No newline at end of file From a056c467a47fa1f2cfe59bc433af3a6f5504219b Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 7 Jun 2022 21:25:19 +0000 Subject: [PATCH 27/37] Add newly added benchmarks to the run all script --- evaluation/distr_benchmarks/run_all.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/evaluation/distr_benchmarks/run_all.sh b/evaluation/distr_benchmarks/run_all.sh index 9162bd352..fdd89e2f9 100755 --- a/evaluation/distr_benchmarks/run_all.sh +++ b/evaluation/distr_benchmarks/run_all.sh @@ -1,5 +1,14 @@ cd $PASH_TOP/evaluation/distr_benchmarks/oneliners bash run.distr.sh +cd $PASH_TOP/evaluation/distr_benchmarks/unix50 +bash run.distr.sh + cd $PASH_TOP/evaluation/distr_benchmarks/nlp +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/analytics-mts +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/dependency_untangling bash run.distr.sh \ No newline at end of file From 9ee0791077b0de94683c83f6f542f3b5598048f0 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 8 Jun 2022 19:51:05 +0000 Subject: [PATCH 28/37] use gzip instead of zip for better streaming support --- .../dependency_untangling/compress_files.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh index d7c331e84..63f405099 100755 --- 
a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh @@ -4,16 +4,11 @@ IN=${IN:-/dependency_untangling/pcap_data/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/compress} mkdir -p ${OUT} -pure_func() { - zip -r -- -} - -export -f pure_func for item in $(hdfs dfs -ls -C ${IN}); do output_name=$(basename $item).zip - hdfs dfs -cat $item | pure_func > $OUT/$output_name + hdfs dfs -cat $item | gzip -c > $OUT/$output_name done echo 'done'; From 63fe6145111c13c3c0c1821d6b261f5db8a18883 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 8 Jun 2022 19:55:04 +0000 Subject: [PATCH 29/37] small changes to setup script --- .../distr_benchmarks/nlp/input/setup.sh | 55 ++++++-------- .../distr_benchmarks/unix50/input/setup.sh | 75 +++++++++---------- 2 files changed, 60 insertions(+), 70 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 1875bbb8a..48bdde472 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -4,21 +4,20 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} [[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } -setup_dataset() { - if [ ! -f ./genesis ]; then - curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis - "$PASH_TOP/scripts/append_nl_if_not.sh" genesis - fi +if [ ! -f ./genesis ]; then + curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis + "$PASH_TOP/scripts/append_nl_if_not.sh" genesis +fi - if [ ! -f ./exodus ]; then - curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus - "$PASH_TOP/scripts/append_nl_if_not.sh" exodus - fi +if [ ! -f ./exodus ]; then + curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus + "$PASH_TOP/scripts/append_nl_if_not.sh" exodus +fi - if [ ! 
-e ./pg ]; then - mkdir pg - cd pg - if [[ "$1" == "--gen-full" ]]; then +if [ ! -e ./pg ]; then + mkdir pg + cd pg + if [[ "$1" == "--full" ]]; then echo 'N.b.: download/extraction will take about 10min' wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then @@ -36,24 +35,16 @@ setup_dataset() { mv data/* . rm nlp.zip data -rf fi - for f in *.txt; do - "$PASH_TOP/scripts/append_nl_if_not.sh" $f - done - cd .. - fi - # Put files in hdfs - hdfs dfs -mkdir /nlp - hdfs dfs -put exodus /nlp/exodus - hdfs dfs -put genesis /nlp/genesis - hdfs dfs -put pg /nlp/pg -} + for f in *.txt; do + "$PASH_TOP/scripts/append_nl_if_not.sh" $f + done + cd .. + +fi -source_var() { - if [[ "$1" == "--small" ]]; then - export ENTRIES=40 - else - # 1% of the input - export ENTRIES=1060 - fi -} +# Put files in hdfs +hdfs dfs -mkdir /nlp +hdfs dfs -put exodus /nlp/exodus +hdfs dfs -put genesis /nlp/genesis +hdfs dfs -put pg /nlp/pg diff --git a/evaluation/distr_benchmarks/unix50/input/setup.sh b/evaluation/distr_benchmarks/unix50/input/setup.sh index 68b831d82..4a7c37dec 100755 --- a/evaluation/distr_benchmarks/unix50/input/setup.sh +++ b/evaluation/distr_benchmarks/unix50/input/setup.sh @@ -19,46 +19,45 @@ if [[ "$1" == "-c" ]]; then exit fi -setup_dataset() { - # Put files in hdfs - hdfs dfs -mkdir /unix50 - - # generate small inputs - # if [ "$#" -eq 1 ] && [ "$1" = "--small" ]; then - # if [ ! -d ./small ]; then - # echo "Generating small-size inputs" - # # FIXME PR: Do we need all of them? - # curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/unix50.zip' > unix50.zip - # unzip unix50.zip - # rm -f unix50.zip - # fi - # hdfs dfs -put small /unix50/small - # return 0 - # fi - - for input in ${inputs[@]} - do - if [ ! 
-f "${input}.txt" ]; then - wget "http://ndr.md/data/unix50/${input}.txt" - "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" - fi - hdfs dfs -put "${input}.txt" /unix50/"${input}.txt" - done +# Put files in hdfs +hdfs dfs -mkdir /unix50 - # increase the original input size 10x - if [ "$#" -eq 1 ] && [ "$1" = "--extended" ]; then - EXTENDED_INPUT_DIR="extended_input/" - mkdir -p $EXTENDED_INPUT_DIR - for file in *.txt; do - rm $EXTENDED_INPUT_DIR/$file - for (( i = 0; i < 10; i++ )); do - cat $file >> $EXTENDED_INPUT_DIR/temp.txt - done - done - hdfs dfs -put $EXTENDED_INPUT_DIR /unix50/$EXTENDED_INPUT_DIR - rm -rf $EXTENDED_INPUT_DIR +# generate small inputs +# if [ "$#" -eq 1 ] && [ "$1" = "--small" ]; then +# if [ ! -d ./small ]; then +# echo "Generating small-size inputs" +# # FIXME PR: Do we need all of them? +# curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/unix50.zip' > unix50.zip +# unzip unix50.zip +# rm -f unix50.zip +# fi +# hdfs dfs -put small /unix50/small +# return 0 +# fi + +for input in ${inputs[@]} +do + if [ ! 
-f "${input}.txt" ]; then + wget "http://ndr.md/data/unix50/${input}.txt" + "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" fi -} + hdfs dfs -put "${input}.txt" /unix50/"${input}.txt" +done + +# increase the original input size 10x +if [ "$#" -eq 1 ] && [ "$1" = "--extended" ]; then + EXTENDED_INPUT_DIR="extended_input/" + mkdir -p $EXTENDED_INPUT_DIR + for file in *.txt; do + rm $EXTENDED_INPUT_DIR/$file + for (( i = 0; i < 10; i++ )); do + cat $file >> $EXTENDED_INPUT_DIR/temp.txt + done + done + hdfs dfs -put $EXTENDED_INPUT_DIR /unix50/$EXTENDED_INPUT_DIR + rm -rf $EXTENDED_INPUT_DIR +fi + source_var() { if [[ "$1" == "--extended" ]]; then From badea8dc442d485fa8df32c0167d3f43aeff6b31 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 8 Jun 2022 19:55:23 +0000 Subject: [PATCH 30/37] fix bug in pcap.sh --- evaluation/distr_benchmarks/dependency_untangling/pcap.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh index 13a0cd29e..237ab092a 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh @@ -6,8 +6,8 @@ mkdir -p $OUT pure_func() { tempfile=$(mktemp) - - tee $tempfile | tcpdump -nn -r '-' -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null + cat > $tempfile + tcpdump -nn -r $tempfile -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null # extract URL tcpdump -nn -r $tempfile -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null # extract passwords From bb3f2e01c2c44efe1e4b2210071ce4c0ec7526c0 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sat, 11 Jun 2022 18:32:33 +0000 Subject: [PATCH 31/37] Add max-temp benchmark --- .../distr_benchmarks/max-temp/input/setup.sh | 31 +++++++++++ .../distr_benchmarks/max-temp/run.distr.sh | 53 
+++++++++++++++++++ .../max-temp/temp-analytics.sh | 20 +++++++ 3 files changed, 104 insertions(+) create mode 100755 evaluation/distr_benchmarks/max-temp/input/setup.sh create mode 100755 evaluation/distr_benchmarks/max-temp/run.distr.sh create mode 100755 evaluation/distr_benchmarks/max-temp/temp-analytics.sh diff --git a/evaluation/distr_benchmarks/max-temp/input/setup.sh b/evaluation/distr_benchmarks/max-temp/input/setup.sh new file mode 100755 index 000000000..ffb5f47b4 --- /dev/null +++ b/evaluation/distr_benchmarks/max-temp/input/setup.sh @@ -0,0 +1,31 @@ +#!/bin/bash +FROM=${FROM:-2015} +TO=${TO:-2015} +IN=${IN:-'http://ndr.md/data/noaa/'} +fetch=${fetch:-"curl -s"} + +data_file=temperatures.txt + +if [[ "$1" == "--extended" ]]; then + echo "Downloading extended input" + dataset_size=14418 +else + dataset_size=1442 +fi + +## Downloading and extracting +seq $FROM $TO | + sed "s;^;$IN;" | + sed 's;$;/;' | + xargs -r -n 1 $fetch | + grep gz | + tr -s ' \n' | + cut -d ' ' -f9 | + sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | + sed "s;^;$IN;" | + head -n $dataset_size | + xargs -n1 $fetch | + gunzip > "${data_file}" + +hdfs dfs -mkdir /max-temp +hdfs dfs -put "${data_file}" /max-temp/"${data_file}" \ No newline at end of file diff --git a/evaluation/distr_benchmarks/max-temp/run.distr.sh b/evaluation/distr_benchmarks/max-temp/run.distr.sh new file mode 100755 index 000000000..3114fb3b9 --- /dev/null +++ b/evaluation/distr_benchmarks/max-temp/run.distr.sh @@ -0,0 +1,53 @@ +PASH_FLAGS='--width 8 --r_split' +export TIMEFORMAT=%R + +export IN="/max-temp/temperatures.txt" + +max-temp_bash(){ + times_file="seq.res" + outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + touch "$times_file" + echo executing max temp $(date) | tee -a "$times_file" + outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" + echo "max-temp.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" +} + +max-temp_pash(){ + 
flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat "$times_file" >> "$times_file".d + echo executing max-temp with $prefix pash $(date) | tee "$times_file" + echo '' >> "$times_file" + + outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" + pash_log="${pash_logs_dir}/temp-analytics.pash.log" + single_time_file="${outputs_dir}/temp-analytics.${time_suffix}" + + echo -n "temp-analytics.sh: " | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" temp-analytics.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" +} + +max-temp_bash + +max-temp_pash "$PASH_FLAGS" "par_no_du" + +max-temp_pash "$PASH_FLAGS --parallel_pipelines --parallel_pipelines_limit 24" "par" + +max-temp_pash "$PASH_FLAGS --distributed_exec" "distr_no_du" + +max-temp_pash "$PASH_FLAGS --parallel_pipelines --distributed_exec --parallel_pipelines_limit 24" "distr" diff --git a/evaluation/distr_benchmarks/max-temp/temp-analytics.sh b/evaluation/distr_benchmarks/max-temp/temp-analytics.sh new file mode 100755 index 000000000..8ab2113d8 --- /dev/null +++ b/evaluation/distr_benchmarks/max-temp/temp-analytics.sh @@ -0,0 +1,20 @@ +#!/bin/bash +IN=${IN:-/max-temp/temperatures.txt} + +## Processing +hdfs dfs -cat "${IN}" | + cut -c 89-92 | + grep -v 999 | + sort -rn | + head -n1 > max.txt + +hdfs dfs -cat "${IN}" | + cut -c 89-92 | + grep -v 999 | + sort -n | + head -n1 > min.txt + +hdfs dfs -cat "${IN}" | + cut -c 89-92 | + grep -v 999 | + awk "{ total += \$1; count++ } END { print total/count }" > average.txt From 8124626dd7c743d86d25ec99bcf68d53b904cfb9 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sat, 11 Jun 2022 18:35:08 +0000 Subject: [PATCH 32/37] fix typo --- 
evaluation/distr_benchmarks/max-temp/run.distr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/max-temp/run.distr.sh b/evaluation/distr_benchmarks/max-temp/run.distr.sh index 3114fb3b9..bdfb10943 100755 --- a/evaluation/distr_benchmarks/max-temp/run.distr.sh +++ b/evaluation/distr_benchmarks/max-temp/run.distr.sh @@ -12,7 +12,7 @@ max-temp_bash(){ touch "$times_file" echo executing max temp $(date) | tee -a "$times_file" outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" - echo "max-temp.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" + echo "temp-analytics.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" } max-temp_pash(){ From 732c78c4bfe4b37474be684d338a26d83ed3d019 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sat, 11 Jun 2022 18:45:21 +0000 Subject: [PATCH 33/37] fixes to eval scripts --- .../distr_benchmarks/nlp/input/setup.sh | 29 +++++++++++++++++-- evaluation/distr_benchmarks/nlp/run.distr.sh | 12 ++++---- .../distr_benchmarks/oneliners/run.distr.sh | 4 +-- evaluation/distr_benchmarks/run_all.sh | 5 +++- .../distr_benchmarks/unix50/run.distr.sh | 15 ++++++---- 5 files changed, 49 insertions(+), 16 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 48bdde472..2f2e7462b 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -5,9 +5,15 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} [[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } if [ ! 
-f ./genesis ]; then +<<<<<<< Updated upstream curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis "$PASH_TOP/scripts/append_nl_if_not.sh" genesis fi +======= + curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis + "$PASH_TOP/scripts/append_nl_if_not.sh" genesis + fi +>>>>>>> Stashed changes if [ ! -f ./exodus ]; then curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus @@ -17,7 +23,11 @@ fi if [ ! -e ./pg ]; then mkdir pg cd pg +<<<<<<< Updated upstream if [[ "$1" == "--full" ]]; then +======= + if [[ "$1" == "--gen-full" ]]; then +>>>>>>> Stashed changes echo 'N.b.: download/extraction will take about 10min' wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then @@ -27,14 +37,16 @@ if [ ! -e ./pg ]; then please contact the pash developers pash-devs@googlegroups.com EOF exit 1 - fi - cat pg.tar.xz | tar -xJ + fi + cat pg.tar.xz | tar -xJ + else wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip unzip nlp.zip mv data/* . rm nlp.zip data -rf fi +<<<<<<< Updated upstream for f in *.txt; do "$PASH_TOP/scripts/append_nl_if_not.sh" $f @@ -48,3 +60,16 @@ hdfs dfs -mkdir /nlp hdfs dfs -put exodus /nlp/exodus hdfs dfs -put genesis /nlp/genesis hdfs dfs -put pg /nlp/pg +======= +for f in *.txt; do + "$PASH_TOP/scripts/append_nl_if_not.sh" $f +done + cd .. 
+fi + + # Put files in hdfs + hdfs dfs -mkdir /nlp + hdfs dfs -put exodus /nlp/exodus + hdfs dfs -put genesis /nlp/genesis + hdfs dfs -put pg /nlp/pg +>>>>>>> Stashed changes diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index a77c00346..8c0488714 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -52,9 +52,9 @@ nlp_bash(){ IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" script="${name_script_parsed[1]}" - printf -v pad %30s - padded_script="${name}.sh:${pad}" - padded_script=${padded_script:0:30} + printf -v pad %40s + padded_script="${name}.sh: ${pad}" + padded_script=${padded_script:0:40} outputs_file="${outputs_dir}/${script}.${outputs_suffix}" @@ -85,9 +85,9 @@ nlp_pash(){ IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" script="${name_script_parsed[1]}" - printf -v pad %30s - padded_script="${name}.sh:${pad}" - padded_script=${padded_script:0:30} + printf -v pad %40s + padded_script="${name}.sh: ${pad}" + padded_script=${padded_script:0:40} outputs_file="${outputs_dir}/${script}.${outputs_suffix}" pash_log="${pash_logs_dir}/${script}.pash.log" diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 95adff56b..948a61b48 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -39,7 +39,7 @@ oneliners_bash() { export dict= printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${script}.sh:${pad}" padded_script=${padded_script:0:30} seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" @@ -77,7 +77,7 @@ oneliners_pash(){ export dict= printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${script}.sh:${pad}" padded_script=${padded_script:0:30} 
outputs_file="${outputs_dir}/${script}.${outputs_suffix}" diff --git a/evaluation/distr_benchmarks/run_all.sh b/evaluation/distr_benchmarks/run_all.sh index fdd89e2f9..4e5934595 100755 --- a/evaluation/distr_benchmarks/run_all.sh +++ b/evaluation/distr_benchmarks/run_all.sh @@ -11,4 +11,7 @@ cd $PASH_TOP/evaluation/distr_benchmarks/analytics-mts bash run.distr.sh cd $PASH_TOP/evaluation/distr_benchmarks/dependency_untangling -bash run.distr.sh \ No newline at end of file +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/max-temp +bash run.distr.sh diff --git a/evaluation/distr_benchmarks/unix50/run.distr.sh b/evaluation/distr_benchmarks/unix50/run.distr.sh index c4dd9149d..1e10f8b6c 100755 --- a/evaluation/distr_benchmarks/unix50/run.distr.sh +++ b/evaluation/distr_benchmarks/unix50/run.distr.sh @@ -1,6 +1,11 @@ PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R - +names_scripts=( + 1 2 3 4 5 6 7 8 9 10 + 11 12 13 14 15 16 17 18 19 20 + 21 23 24 25 26 28 29 + 30 31 32 33 34 35 36 + ) if [[ "$1" == "--extended" ]]; then echo "Using extended input" export IN_PRE=/unix50/extended_input @@ -20,7 +25,7 @@ unix50_bash(){ echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" - for number in `seq 36` + for number in ${names_scripts[@]} do script="${number}" @@ -30,7 +35,7 @@ unix50_bash(){ outputs_file="${outputs_dir}/${script}.${outputs_suffix}" - echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + echo "${padded_script}" $({ time ./${script}.sh 2> /dev/null > "$outputs_file"; } 2>&1) | tee -a "$times_file" done } @@ -53,7 +58,7 @@ unix50_pash(){ echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" - for number in `seq 36` + for number in ${names_scripts[@]} do script="${number}" @@ -66,7 +71,7 @@ unix50_pash(){ single_time_file="${outputs_dir}/${script}.${time_suffix}" echo -n "${padded_script}" | tee -a "$times_file" - { time "$PASH_TOP/pa.sh" $flags --log_file 
"${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh 2> /dev/null > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done } From 55aee240b0fb7713a2c2d0f67edf1ce79d796306 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 12 Jun 2022 03:12:30 +0000 Subject: [PATCH 34/37] small bug --- evaluation/distr_benchmarks/max-temp/run.distr.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/max-temp/run.distr.sh b/evaluation/distr_benchmarks/max-temp/run.distr.sh index bdfb10943..7d43a3532 100755 --- a/evaluation/distr_benchmarks/max-temp/run.distr.sh +++ b/evaluation/distr_benchmarks/max-temp/run.distr.sh @@ -10,7 +10,8 @@ max-temp_bash(){ mkdir -p "$outputs_dir" touch "$times_file" - echo executing max temp $(date) | tee -a "$times_file" + cat "$times_file" >> "$times_file".d + echo executing max temp $(date) | tee "$times_file" outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" echo "temp-analytics.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" } From e22dd836ad22aab5707a0e38a4f1a563decbfe3e Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 15 Jun 2022 02:49:02 +0000 Subject: [PATCH 35/37] fix small issues --- evaluation/distr_benchmarks/nlp/4_3.sh | 1 - evaluation/distr_benchmarks/nlp/8.3_3.sh | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index e817e36b8..3e2d98aef 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -7,7 +7,6 @@ IN=${IN:-/nlp/pg/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" -echo $ENTRIES pure_func() { input=$1 diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh 
b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 937522b3f..b0df13b9e 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -2,7 +2,7 @@ # tag: compare_exodus_genesis.sh # set -e -IN=${IN:-/nlp/pg/} +IN=${IN:-/nlp/pg} INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} ENTRIES=${ENTRIES:-1060} @@ -10,9 +10,10 @@ mkdir -p $OUT pure_func() { input=$1 + input2=$2 TEMPDIR=$(mktemp -d) cat > ${TEMPDIR}/${input}1.types - hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${TEMPDIR}/${input}2.types + hdfs dfs -cat ${input2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${TEMPDIR}/${input}2.types sort ${TEMPDIR}/${input}1.types ${TEMPDIR}/${input}2.types ${TEMPDIR}/${input}2.types | uniq -c | head rm -rf ${TEMPDIR} } @@ -20,7 +21,7 @@ export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input $INPUT2 > ${OUT}/${input}.out done echo 'done'; From 6fc191d9d53e6ba30f993dc5e4713ff9c74023d9 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 16 Jun 2022 01:43:34 +0000 Subject: [PATCH 36/37] change bigrams to be consistant and add hdfs put annotation --- annotations/hdfs.json | 11 +++++++++++ evaluation/distr_benchmarks/oneliners/bi-grams.sh | 5 +++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/annotations/hdfs.json b/annotations/hdfs.json index 60c18da71..1796fa4fd 100644 --- a/annotations/hdfs.json +++ b/annotations/hdfs.json @@ -13,6 +13,17 @@ "outputs": ["stdout"], "comments": "This represents hdfs dfs -cat . 
Slightly hacky since we only check for -cat" }, + { + "predicate": + { + "operator": "exists", + "operands": ["-put"] + }, + "class": "pure", + "inputs": ["stdin"], + "outputs": ["stdout"], + "comments": "Ideally we would use stdin-hyphen but unfortunatly hdfs put deadlocks on fifo" + }, { "predicate": "default", "class": "side-effectful", diff --git a/evaluation/distr_benchmarks/oneliners/bi-grams.sh b/evaluation/distr_benchmarks/oneliners/bi-grams.sh index a081a05ec..460f565d3 100755 --- a/evaluation/distr_benchmarks/oneliners/bi-grams.sh +++ b/evaluation/distr_benchmarks/oneliners/bi-grams.sh @@ -1,12 +1,13 @@ #!/bin/bash # Find all 2-grams in a piece of text -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} . bi-gram.aux.sh hdfs dfs -cat $IN | - tr -cs A-Za-z '\n' | + tr -c 'A-Za-z' '[\n*]' | + grep -v "^\s*$" | tr A-Z a-z | bigrams_aux | sort | From 26ce8670248bdcd4bc6a1a03a77fe88379a50a98 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 16 Jun 2022 02:17:37 +0000 Subject: [PATCH 37/37] fix leftover merge conflict --- .../distr_benchmarks/nlp/input/setup.sh | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 2f2e7462b..e523d21a8 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -5,15 +5,9 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} [[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } if [ ! -f ./genesis ]; then -<<<<<<< Updated upstream curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis "$PASH_TOP/scripts/append_nl_if_not.sh" genesis fi -======= - curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis - "$PASH_TOP/scripts/append_nl_if_not.sh" genesis - fi ->>>>>>> Stashed changes if [ ! -f ./exodus ]; then curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus @@ -23,11 +17,7 @@ fi if [ ! 
-e ./pg ]; then mkdir pg cd pg -<<<<<<< Updated upstream if [[ "$1" == "--full" ]]; then -======= - if [[ "$1" == "--gen-full" ]]; then ->>>>>>> Stashed changes echo 'N.b.: download/extraction will take about 10min' wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then @@ -46,7 +36,6 @@ if [ ! -e ./pg ]; then mv data/* . rm nlp.zip data -rf fi -<<<<<<< Updated upstream for f in *.txt; do "$PASH_TOP/scripts/append_nl_if_not.sh" $f @@ -60,16 +49,3 @@ hdfs dfs -mkdir /nlp hdfs dfs -put exodus /nlp/exodus hdfs dfs -put genesis /nlp/genesis hdfs dfs -put pg /nlp/pg -======= -for f in *.txt; do - "$PASH_TOP/scripts/append_nl_if_not.sh" $f -done - cd .. -fi - - # Put files in hdfs - hdfs dfs -mkdir /nlp - hdfs dfs -put exodus /nlp/exodus - hdfs dfs -put genesis /nlp/genesis - hdfs dfs -put pg /nlp/pg ->>>>>>> Stashed changes