From 229c91c073a7d7f1591ee3c958bc7347ab9daa33 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 26 Apr 2022 13:41:37 +0000 Subject: [PATCH 01/37] porting some of the for loops to hdfs --- .../dependency_untangling/compress_files.sh | 21 +++ .../dependency_untangling/encrypt_files.sh | 20 +++ .../dependency_untangling/genomics.sh | 42 +++++ .../dependency_untangling/img_convert.sh | 12 ++ .../input/install-deps.sh | 34 ++++ .../dependency_untangling/input/packages | 150 ++++++++++++++++ .../dependency_untangling/input/setup.sh | 160 ++++++++++++++++++ .../dependency_untangling/nginx.sh | 36 ++++ .../dependency_untangling/pacaur.sh | 42 +++++ .../dependency_untangling/pcap.sh | 25 +++ .../dependency_untangling/proginf.sh | 18 ++ .../dependency_untangling/run.distr.sh | 90 ++++++++++ .../dependency_untangling/to_mp3.sh | 21 +++ 13 files changed, 671 insertions(+) create mode 100755 evaluation/distr_benchmarks/dependency_untangling/compress_files.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/genomics.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/img_convert.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh create mode 100644 evaluation/distr_benchmarks/dependency_untangling/input/packages create mode 100755 evaluation/distr_benchmarks/dependency_untangling/input/setup.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/nginx.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/pacaur.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/pcap.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/proginf.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/run.distr.sh create mode 100755 evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh diff --git 
a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh new file mode 100755 index 000000000..652ce1969 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# compress all files in a directory +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/compress} +LOGS=${OUT}/logs +mkdir -p ${OUT}/logs +run_tests() { + name=$(basename $1).zip + zip -r ${OUT}/$name $1 +} + +export -f run_tests + +pkg_count=0 +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > "${LOGS}"/"$pkg_count.log" +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh new file mode 100755 index 000000000..421732513 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# encrypt all files in a directory +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/encrypt} +LOGS=${OUT}/logs +mkdir -p ${LOGS} +run_tests() { + openssl enc -aes-256-cbc -pbkdf2 -iter 20000 -in $1 -out $OUT/$(basename $1).enc -k 'key' +} + +export -f run_tests +pkg_count=0 + +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > ${LOGS}/${pkg_count}.log +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/genomics.sh b/evaluation/distr_benchmarks/dependency_untangling/genomics.sh new file mode 100755 index 000000000..e5af9c9b9 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/genomics.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# create bam files with regions +################### 1KG 
SAMPLES +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input} +SAMTOOLS_BIN=${IN}/deps/samtools-1.7/samtools +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/bio} +LOGS=${OUT}/logs +IN_NAME=${IN}/bio/100G.txt +GENE_LOCS=${IN}/bio/Gene_locs.txt +mkdir -p ${LOGS} +run_tests() { + s_line=$(echo $1 | tr '@' ' ') + pop=$(echo $s_line |cut -f 1 -d " "); + sample=$(echo $s_line |cut -d " " -f 2); + link=$(echo $s_line |cut -f 3 -d " "); + ### correcting labeling of chromosomes so that all are 1,2,3.. instead of chr1,chr2 or chromosome1 etc + echo 'Processing Sample '${IN}/bio/$sample' '; + # uniform the chromosomes in the file due to inconsistencies + $SAMTOOLS_BIN view -H "${IN}/bio/$sample".bam | sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' -e 's/SN:MT/SN:chrM/' \ + | $SAMTOOLS_BIN reheader - "${IN}/bio/$sample".bam > "${OUT}/$sample"_corrected.bam 2> /dev/null + # create bai file + $SAMTOOLS_BIN index -b "${OUT}/$sample"_corrected.bam 2> /dev/null + ### Isolating each relevant chromosome based on Gen_locs + cut -f 2 ${IN}/bio/Gene_locs.txt |sort |uniq |while read chr; + do + echo 'Isolating Chromosome '$chr' from sample '${OUT}/$sample', '; + $SAMTOOLS_BIN view -b "${OUT}/$sample"_corrected.bam chr"$chr" > "${OUT}/$pop"_"$sample"_"$chr".bam 2> /dev/null + echo 'Indexing Sample '$pop'_'${OUT}/$sample' '; + $SAMTOOLS_BIN index -b "${OUT}/$pop"_"$sample"_"$chr".bam 2> /dev/null + done; +} + +export -f run_tests +data=$(cat ${IN_NAME} | tr ' ' '@') +pkg_count=0 +for item in $data; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > "${LOGS}"/"${pkg_count}.log" +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh new file mode 100755 index 000000000..2b87d0528 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# tag: resize image 
+IN=${JPG:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/jpg} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/jpg} +mkdir -p ${OUT} +for i in $IN/*.jpg; +do + out=$OUT/$(basename -- $i) + convert -resize 70% "$i" "$out"; +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh new file mode 100755 index 000000000..4cb9e845a --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh @@ -0,0 +1,34 @@ +IN=$PASH_TOP/evaluation/benchmarks/dependency_untangling/input/ +mkdir -p ${IN}/deps/ +# install dependencies +pkgs='ffmpeg unrtf imagemagick libarchive-tools zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump' + +if ! dpkg -s $pkgs >/dev/null 2>&1 ; then + sudo apt-get install $pkgs -y + echo 'Packages Installed' +fi + +if [ ! -d ${IN}/deps/samtools-1.7 ]; then + cd ${IN}/deps/ + wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip + unzip 1.7.zip + rm 1.7.zip + cd samtools-1.7 + wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip + unzip 1.7.zip + autoheader # Build config.h.in (this may generate a warning about + # AC_CONFIG_SUBDIRS - please ignore it). + autoconf -Wno-syntax # Generate the configure script + ./configure # Needed for choosing optional functionality + make + rm -rf 1.7.zip + echo 'Samtools installed' +fi + +if [ ! 
-f ${IN}/deps/makedeb.deb ]; then + cd ${IN}/deps/ + wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb + sudo dpkg -i makedeb.deb + echo 'Makedeb installed' +fi + diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/packages b/evaluation/distr_benchmarks/dependency_untangling/input/packages new file mode 100644 index 000000000..3d2fb08a6 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/input/packages @@ -0,0 +1,150 @@ +w3watch +cant +jzip +zork1 +zork2 +zork3 +atari-adventure +eclipse-subclipse +wallpaper-lightning +squirrelmail +atari-bowling +atari-breakout +atari-combat +atari-space-invaders +cdm-git +gtk-gnutella +roundcubemail-plugin-chbox +roundcubemail-plugin-jquery-mobile +roundcubemail-plugin-mobile +eclipse-svnkit +eclipse-dltk-core +eclipse-dltk-javascript +eclipse-antlr-runtime +eclipse-dltk-shelled +eclipse-linuxtools +eclipse-dltk-python +eclipse-antlr4-runtime +eclipse-jsonedit +eclipse-goclipse +adwaita-dark-darose +roundcubemail-plugin-keyboard-shortcuts-ng +refind-theme-tux-git +refind-theme-metro-git +ggmud-svn +libiriverdb +griver +fsv2 +vecx-git +lib32-glib +lib32-gtk +qjoypad +yumbootstrap-git +nesasm-git +yum-metadata-parser +libretro-fmsx-git +projectm-git +papu-vst-git +rp2a03-vst-git +sn76489-vst-git +ggmud +bpm-git +kodi-addon-vfs-rar +eduke32-git +voidsw-git +kodi-addon-vfs-rar-git +rottexpr-shareware-git +bubblemon +gno3dtet +tutka +netpanzer +stratagus +scourge +lives +drqueue +cytadela +bitefusion +globs-svn +ri-li +globs-benchmarks-svn +dunelegacy +eternallands-sound +getlive +shadermaker +csl +brother-dcp350c +pacstats-hg +pacstats +tracy +eternallands-music +cal3d-svn +tracy-git +ncine-git +ncpong-git +ncparticleeditor-git +ncinvaders-git +ncine +ncline-git +eternallands +nctracer-git +nctiledviewer-git +spookyghost-git +postgresqltuner +luniistore +unscd +netatop +scamper +graylog +bitcoinxt +prysm +python-pysword +python2-pysword +mp3rename +plc +python3-sensors-git +python2-bencode 
+python2-binplist +python2-dfvfs +python2-dfwinreg +python2-artifacts +python2-pytsk3 +python2-libbde +python2-libesedb +python2-libevt +python2-libevtx +python2-libfsntfs +python2-libfwsi +python2-liblnk +python2-libmsiecf +python2-libolecf +python2-libqcow +python2-libregf +python2-libscca +python2-libsigscan +python2-libsmdev +python2-libsmraw +python2-libvhdi +python2-libvmdk +python2-libvshadow +python2-libewf +python2-dfdatetime +libvhdi +python2-acora +python2-efilter +audioconvert +gogglesmm-git +gogglesmm-develop-git +hangupsbot +python-quamash-git +python-reparser +pwgen-passphrase +macchiato-git +ft232r_prog +gr-dsd-git +rtl_power_fftw-git +python2-pyrtlsdr-git +python-pyrtlsdr-git +csdr-git +gr-dab-git +rtlsdr-scanner-git +shinysdr-git diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh new file mode 100755 index 000000000..58ee4bd7d --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +# exit when any command fails +#set -e + +IN=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/ +OUT=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/output/ +IN_NAME=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/100G.txt + +if [ "$1" == "-c" ]; then + rm -rf ${IN}/jpg + rm -rf ${IN}/log_data + rm -rf ${IN}/wav + rm -rf ${IN}/nginx-logs + rm -rf ${IN}/node_modules + rm -rf ${IN}/pcap_data + rm -rf ${IN}/pcaps + rm -rf ${IN}/packages + rm -rf ${IN}/mir-sa + rm -rf ${IN}/deps + rm -rf ${IN}/bio + rm -rf ${IN}/output + rm -rf ${OUT} + exit +fi + +setup_dataset() { + if [ "$1" == "--small" ]; then + LOG_DATA_FILES=6 + WAV_DATA_FILES=20 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip + 
PCAP_DATA_FILES=1 + else + LOG_DATA_FILES=84 + WAV_DATA_FILES=120 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip + PCAP_DATA_FILES=15 + fi + + if [ ! -d ${IN}/wav ]; then + wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip + unzip wav.zip && cd wav/ + for f in *.wav; do + FILE=$(basename "$f") + for (( i = 0; i <= $WAV_DATA_FILES; i++)) do + echo copying to $f$i.wav + cp $f $f$i.wav + done + done + echo "WAV Generated" + fi + + if [ ! -d ${IN}/jpg ]; then + cd ${IN} + wget $JPG_DATA_LINK + unzip jpg.zip + echo "JPG Generated" + rm -rf ${IN}/jpg.zip + fi + + # download the input for the nginx logs and populate the dataset + if [ ! -d ${IN}/log_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip + unzip nginx.zip + rm nginx.zip + # generating analysis logs + mkdir -p ${IN}/log_data + for (( i = 1; i <=$LOG_DATA_FILES; i++)) do + for j in nginx-logs/*;do + n=$(basename $j) + cat $j > log_data/log${i}_${n}.log; + done + done + echo "Logs Generated" + fi + + if [ ! -d ${IN}/bio ]; then + if [ "$1" = "--small" ]; then + # download the Genome loc file + wget $BIO_DATA_LINK + unzip bio.zip + cd bio + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt + cd .. + rm bio.zip + else + mkdir ${IN}/bio + cd ${IN}/bio + # download the file containing the links for the dataset + wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt + # download the Genome loc file + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # start downloading the real dataset + cat ${IN_NAME} |while read s_line; + do + echo ${IN_NAME} + sample=$(echo $s_line |cut -d " " -f 2); + if [[ ! 
-f $sample ]]; then + pop=$(echo $s_line |cut -f 1 -d " "); + link=$(echo $s_line |cut -f 3 -d " "); + wget -O "$sample".bam "$link"; ##this part can be adjusted maybe + fi + done; + fi + echo "Genome data downloaded" + fi + + # download the initial pcaps to populate the whole dataset + if [ ! -d ${IN}/pcap_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip + unzip pcaps.zip + rm pcaps.zip + mkdir ${IN}/pcap_data/ + # generates 20G + for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do + for j in ${IN}/pcaps/*;do + n=$(basename $j) + cat $j > pcap_data/pcap${i}_${n}; + done + done + echo "Pcaps Generated" + fi + + # download the modules for the Mir static analyses + if [ ! -d ${IN}/node_modules ]; then + cd $IN + wget $NODE_MODULE_LINK + unzip node_modules.zip + rm node_modules.zip + # download the specific mir version + wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip + unzip mir-sa.zip + rm mir-sa.zip + echo "Node modules generated" + fi + + # download the packages for the package building + if [ ! 
-f ${IN}/packages ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/packages + if [ "$1" = "--small" ]; then + head -n 20 packages > p + mv p packages + fi + echo "Package datset downloaded" + fi +} + +source_var() { + export IN= +} + +setup_dataset diff --git a/evaluation/distr_benchmarks/dependency_untangling/nginx.sh b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh new file mode 100755 index 000000000..afd53af8e --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# tag: nginx logs +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/log_data} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/nginx-logs} +mkdir -p ${OUT} + +run_tests() { + # i don't think we should assign things to $0, however, it works with both + IN=$1 + cat $IN | cut -d "\"" -f3 | cut -d ' ' -f2 | sort | uniq -c | sort -rn + # awk alternative, too slow + awk '{print $9}' $IN | sort | uniq -c | sort -rn + # find broken links broken links + awk '($9 ~ /404/)' $IN | awk '{print $7}' | sort | uniq -c | sort -rn + # for 502 (bad-gateway) we can run following command: + awk '($9 ~ /502/)' $IN | awk '{print $7}' | sort | uniq -c | sort -r + # Who are requesting broken links (or URLs resulting in 502) + awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' $IN | awk '{print $1}' | sort | uniq -c | sort -r + # 404 for php files -mostly hacking attempts + awk '($9 ~ /404/)' $IN | awk -F\" '($2 ~ "^GET .*.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20 + ############################## + # Most requested URLs ######## + awk -F\" '{print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r + # Most requested URLs containing XYZ + awk -F\" '($2 ~ "ref"){print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r +} + +export -f run_tests +for f in ${IN}/*; do + #bash -c 'run_tests $0 $1' $f $f #> /dev/null + #run_tests $f > /dev/null + 
logname=$OUT/$(basename $f) + run_tests $f > $logname +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/pacaur.sh b/evaluation/distr_benchmarks/dependency_untangling/pacaur.sh new file mode 100755 index 000000000..b8a76594c --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/pacaur.sh @@ -0,0 +1,42 @@ +#!/bin/bash +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/packages} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/packages} +LOGS=${OUT}/logs +mkdir -p ${OUT} ${LOGS} + +info() { echo -e "\e[1m--> $@\e[0m"; } +mkcd() { mkdir -p "$1" && cd "$1"; } + +# check if not running as root +# test "$UID" -gt 0 || { info "don't run this as root!"; exit; } + + + +run_tests() { + pgk=$1 + info "create subdirectory for $pkg" + mkcd "${OUT}/$pkg" + + # set link to plaintext PKGBUILDs + pkgbuild="https://aur.archlinux.org/cgit/aur.git/plain/PKGBUILD?h" + + info "fetch PKGBUILD for $pkg" + curl --insecure -o PKGBUILD "$pkgbuild=$pkg" 2> /dev/null|| echo ' ' + + #info "fetch required pgp keys from PKGBUILD" + #gpg --recv-keys $(sed -n "s:^validpgpkeys=('\([0-9A-Fa-fx]\+\)').*$:\1:p" PKGBUILD) + info "make and install ..." 
+ timeout 100 makedeb-makepkg --format-makedeb -d 2>/dev/null|| echo 'failed' + cd - +} + +export -f run_tests +pkg_count=0 +# loop over required packages +for pkg in $(cat ${IN} | tr '\n' ' ' ); +do + pkg_count=$((pkg_count + 1)) + run_tests $pkg > "${LOGS}"/"$pkg_count.log" +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh new file mode 100755 index 000000000..d4e1b70ea --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#tag: pcap analysis +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/pcap-analysis} +LOGS=${OUT}/logs +mkdir -p ${LOGS} +run_tests() { + INPUT=$1 + /usr/sbin/tcpdump -nn -r ${INPUT} -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null + # extract URL + /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null + # extract passwords + /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -A -n -l 2> /dev/null | egrep -i "POST /|pwd=|passwd=|password=|Host:" 2> /dev/null +} +export -f run_tests + +pkg_count=0 + +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > ${LOGS}/${pkg_count}.log +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/proginf.sh b/evaluation/distr_benchmarks/dependency_untangling/proginf.sh new file mode 100755 index 000000000..52f33fd04 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/proginf.sh @@ -0,0 +1,18 @@ +#!/bin/bash +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/node_modules} +MIR_BIN=${MIR_BIN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/mir-sa/.bin/mir-sa} 
+OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mir} +mkdir -p ${OUT}/ +pkg_count=0 +run_tests() { + cd $1; + ${MIR_BIN} -p 2>>${OUT}/error.log +} +export -f run_tests +for item in ${IN}/*; +do + pkg_count=$((pkg_count + 1)); + run_tests $item > ${OUT}/$pkg_count.log +done + +echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh new file mode 100755 index 000000000..8928ed6be --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh @@ -0,0 +1,90 @@ +PASH_FLAGS='--width 6 --r_split' +export TIMEFORMAT=%R +export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" + +names_scripts=( + "MediaConv1;img_convert" + "MediaConv2;to_mp3" + "Program_Inference;proginf" + "LogAnalysis1;nginx" + "LogAnalysis2;pcap" + # "Genomics_Computation;genomics" + "AurPkg;pacaur" + "FileEnc1;compress_files" + "FileEnc2;encrypt_files" + ) + +oneliners_bash() { + seq_times_file="seq.res" + seq_outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$seq_times_file" + cat $seq_times_file > $seq_times_file.d + echo executing one-liners $(date) | tee -a "$seq_times_file" + echo '' > "$seq_times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + export IN= + export OUT= + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) | tee -a "$seq_times_file" + done +} + +oneliners_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + 
pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat $times_file > $times_file.d + echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" + echo '' > "$times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + + export IN= + export OUT= + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done +} + +# oneliners_bash +oneliners_pash "$PASH_FLAGS" "par" +# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" diff --git a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh new file mode 100755 index 000000000..1f84bb277 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# tag: wav-to-mp3 +IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/wav} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mp3} +LOGS=${OUT}/logs +mkdir -p ${LOGS} +trigrams_aux(){ + ffmpeg -y -i pipe:0 -f mp3 -ab 192000 pipe:1 2>/dev/null +} + +export -f trigrams_aux + +pkg_count=0 +for item in $(hdfs dfs -ls -C /for-loops/wav); +do + pkg_count=$((pkg_count + 1)); + out="$OUT/$(basename $item).mp3" + hdfs dfs -cat $item | trigrams_aux > $out +done + +echo 'done'; From a11812d6d4970ceb3bbd657c9a48818a736c5573 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: 
Tue, 26 Apr 2022 13:44:10 +0000 Subject: [PATCH 02/37] porting oneliners to use hdfs --- .../distr_benchmarks/oneliners/.gitignore | 1 + .../distr_benchmarks/oneliners/README.md | 18 ++++ .../distr_benchmarks/oneliners/bi-gram.aux.sh | 96 +++++++++++++++++++ .../distr_benchmarks/oneliners/bi-grams.sh | 15 +++ evaluation/distr_benchmarks/oneliners/diff.sh | 21 ++++ .../oneliners/input/.gitignore | 3 + .../distr_benchmarks/oneliners/input/setup.sh | 91 ++++++++++++++++++ .../distr_benchmarks/oneliners/nfa-regex.sh | 6 ++ .../distr_benchmarks/oneliners/run.distr.sh | 90 +++++++++++++++++ .../distr_benchmarks/oneliners/set-diff.sh | 20 ++++ .../oneliners/shortest-scripts.sh | 11 +++ .../distr_benchmarks/oneliners/sort-sort.sh | 6 ++ evaluation/distr_benchmarks/oneliners/sort.sh | 7 ++ .../distr_benchmarks/oneliners/spell.sh | 16 ++++ .../distr_benchmarks/oneliners/top-n.sh | 8 ++ evaluation/distr_benchmarks/oneliners/wf.sh | 6 ++ 16 files changed, 415 insertions(+) create mode 100644 evaluation/distr_benchmarks/oneliners/.gitignore create mode 100644 evaluation/distr_benchmarks/oneliners/README.md create mode 100755 evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh create mode 100755 evaluation/distr_benchmarks/oneliners/bi-grams.sh create mode 100755 evaluation/distr_benchmarks/oneliners/diff.sh create mode 100644 evaluation/distr_benchmarks/oneliners/input/.gitignore create mode 100755 evaluation/distr_benchmarks/oneliners/input/setup.sh create mode 100755 evaluation/distr_benchmarks/oneliners/nfa-regex.sh create mode 100755 evaluation/distr_benchmarks/oneliners/run.distr.sh create mode 100755 evaluation/distr_benchmarks/oneliners/set-diff.sh create mode 100755 evaluation/distr_benchmarks/oneliners/shortest-scripts.sh create mode 100755 evaluation/distr_benchmarks/oneliners/sort-sort.sh create mode 100755 evaluation/distr_benchmarks/oneliners/sort.sh create mode 100755 evaluation/distr_benchmarks/oneliners/spell.sh create mode 100755 
evaluation/distr_benchmarks/oneliners/top-n.sh create mode 100755 evaluation/distr_benchmarks/oneliners/wf.sh diff --git a/evaluation/distr_benchmarks/oneliners/.gitignore b/evaluation/distr_benchmarks/oneliners/.gitignore new file mode 100644 index 000000000..2211df63d --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/evaluation/distr_benchmarks/oneliners/README.md b/evaluation/distr_benchmarks/oneliners/README.md new file mode 100644 index 000000000..cf6a5381a --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/README.md @@ -0,0 +1,18 @@ +## Expert One-liners + +This directory contains ten scripts collected from several sources, including GitHub, Stack Overflow, and the Unix literature. +They are written by developers who are (or approximate) experts in Unix shell scripting, and include several Unix classics. + +1. `nfa-regex.sh` Match complex regular-expression over input +2. `sort.sh` Sort a text input +3. `top-n.sh` Find the top 1000 terms in a document +4. `wf.sh` Calculate the frequency of each word in the document, and sort by frequency +5. `spell.sh` Compute misspelled words in an input document +6. `bi-grams.sh` Find all 2-grams in a piece of text +7. `diff.sh` Compares two streams element by element +8. `set-diff.sh` Show the set-difference between two streams (i.e., elements in the first that are not in the second). +9. `shortest-scripts.sh` Find the shortest scripts +10. `sort-sort.sh` Calculate sort twice + +The `bi-gram.aux.sh` script contains helper functions for `bi-grams.sh`. +To generate inputs, run `./input/setup.sh`. 
diff --git a/evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh b/evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh new file mode 100755 index 000000000..5f66058b2 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/bi-gram.aux.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Auxiliary functions for bi-grams + +bigrams_aux() +{ + s2=$(mktemp -u) + mkfifo $s2 + tee $s2 | + tail -n +2 | + paste $s2 - | + sed '$d' + rm $s2 +} + +bigram_aux_map() +{ + IN=$1 + OUT=$2 + AUX_HEAD=$3 + AUX_TAIL=$4 + + s2=$(mktemp -u) + aux1=$(mktemp -u) + aux2=$(mktemp -u) + aux3=$(mktemp -u) + temp=$(mktemp -u) + + mkfifo $s2 + mkfifo $aux1 + mkfifo $aux2 + mkfifo $aux3 + + ## New way of doing it using an intermediate file. This is slow + ## but doesn't deadlock + cat $IN > $temp + + sed '$d' $temp > $aux3 & + cat $temp | head -n 1 > $AUX_HEAD & + cat $temp | tail -n 1 > $AUX_TAIL & + cat $temp | tail -n +2 | paste $aux3 - > $OUT & + + # ## Old way of doing it + # cat $IN | + # tee $s2 $aux1 $aux2 | + # tail -n +2 | + # paste $s2 - > $OUT & + + # ## The goal of this is to write the first line of $IN in the $AUX_HEAD + # ## stream and the last line of $IN in $AUX_TAIL + + # cat $aux1 | ( head -n 1 > $AUX_HEAD; $PASH_TOP/evaluation/tools/drain_stream.sh ) & + # # while IFS= read -r line + # # do + # # old_line=$line + # # done < $aux2 + # # echo "$old_line" > $AUX_TAIL + # ( tail -n 1 $aux2 > $AUX_TAIL; $PASH_TOP/evaluation/tools/drain_stream.sh ) & + + wait + + rm $temp + rm $s2 + rm $aux1 + rm $aux2 + rm $aux3 +} + +bigram_aux_reduce() +{ + IN1=$1 + AUX_HEAD1=$2 + AUX_TAIL1=$3 + IN2=$4 + AUX_HEAD2=$5 + AUX_TAIL2=$6 + OUT=$7 + AUX_HEAD_OUT=$8 + AUX_TAIL_OUT=$9 + + temp=$(mktemp -u) + + mkfifo $temp + + cat $AUX_HEAD1 > $AUX_HEAD_OUT & + cat $AUX_TAIL2 > $AUX_TAIL_OUT & + paste $AUX_TAIL1 $AUX_HEAD2 > $temp & + cat $IN1 $temp $IN2 > $OUT & + + wait + + rm $temp +} + +export -f bigrams_aux +export -f bigram_aux_map +export -f bigram_aux_reduce diff --git 
a/evaluation/distr_benchmarks/oneliners/bi-grams.sh b/evaluation/distr_benchmarks/oneliners/bi-grams.sh new file mode 100755 index 000000000..a081a05ec --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/bi-grams.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Find all 2-grams in a piece of text + +IN=${IN:-/1G.txt} + +. bi-gram.aux.sh + +hdfs dfs -cat $IN | + tr -cs A-Za-z '\n' | + tr A-Z a-z | + bigrams_aux | + sort | + uniq + + diff --git a/evaluation/distr_benchmarks/oneliners/diff.sh b/evaluation/distr_benchmarks/oneliners/diff.sh new file mode 100755 index 000000000..9435ad1d7 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/diff.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Compares two streams element by element +# Taken from https://crashingdaily.wordpress.com/2008/03/06/diff-two-stdout-streams/ +# shuf() { awk 'BEGIN {srand(); OFMT="%.17f"} {print rand(), $0}' "$@" | sort -k1,1n | cut -d ' ' -f2-; } + +IN=${IN:-/1G.txt} + +mkfifo s1 s2 + +hdfs dfs -cat $IN | + # shuf | + tr [:lower:] [:upper:] | + sort > s1 & + +hdfs dfs -cat $IN | + # shuf | + tr [:upper:] [:lower:] | + sort > s2 & + +diff -B s1 s2 +rm s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/input/.gitignore b/evaluation/distr_benchmarks/oneliners/input/.gitignore new file mode 100644 index 000000000..047dcd20b --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/input/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!setup.sh diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh new file mode 100755 index 000000000..eb08a2d42 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +#set -e + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +# another solution for capturing HTTP status code +# https://superuser.com/a/590170 +input_files="1M.txt 10M.txt 100M.txt 1G.txt dict.txt 3G.txt 10G.txt 100G.txt all_cmds.txt all_cmdsx100.txt small" + +if [ ! 
-f ./1M.txt ]; then + curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt + if [ $? -ne 0 ]; then + echo 'cannot find 1M.txt -- please contact the developers of pash' + exit 1 + fi +fi + +if [ ! -f ./10M.txt ]; then + touch 10M.txt + for (( i = 0; i < 10; i++ )); do + cat 1M.txt >> 10M.txt + done +fi + +if [ ! -f ./100M.txt ]; then + touch 100M.txt + for (( i = 0; i < 10; i++ )); do + cat 10M.txt >> 100M.txt + done +fi + +if [ ! -f ./1G.txt ]; then + curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt + if [ $? -ne 0 ]; then + echo 'cannot find 1G.txt -- please contact the developers of pash' + exit 1 + fi +fi + +# download wamerican-insane dictionary and sort according to machine +if [ ! -f ./dict.txt ]; then + curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt + if [ $? -ne 0 ]; then + echo 'cannot find dict.txt -- please contact the developers of pash' + exit 1 + fi +fi + +if [ ! -f ./all_cmds.txt ]; then + curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt + if [ $? -ne 0 ]; then + # This should be OK for tests, no need for abort + ls /usr/bin/* > all_cmds.txt + fi +fi + +hdfs dfs -put ./10M.txt /10M.txt +hdfs dfs -put ./100M.txt /100M.txt +hdfs dfs -put ./1G.txt /1G.txt +hdfs dfs -put ./all_cmds.txt + + +if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then + echo "Generating full-size inputs" + # FIXME PR: Do we need all of them? + + # if [ ! -f ./3G.txt ]; then + # touch 3G.txt + # for (( i = 0; i < 3; i++ )); do + # cat 1G.txt >> 3G.txt + # done + # fi + # hdfs dfs -put ./3G.txt /3G.txt + + # if [ ! -f ./10G.txt ]; then + # touch 10G.txt + # for (( i = 0; i < 10; i++ )); do + # cat 1G.txt >> 10G.txt + # done + # fi + # hdfs dfs -put ./10G.txt /10G.txt + + if [ ! 
-f ./all_cmdsx100.txt ]; then + touch all_cmdsx100.txt + for (( i = 0; i < 100; i++ )); do + cat all_cmds.txt >> all_cmdsx100.txt + done + fi + hdfs dfs -put ./all_cmdsx100.txt /all_cmdsx100.txt +fi \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh new file mode 100755 index 000000000..2a2c30718 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Match complex regular-expression over input + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | tr A-Z a-z | grep '\(.\).*\1\(.\).*\2\(.\).*\3\(.\).*\4' diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh new file mode 100755 index 000000000..7a3b4a4f2 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -0,0 +1,90 @@ +PASH_FLAGS='--width 6 --r_split' +export TIMEFORMAT=%R +export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" + +scripts_inputs=( + "nfa-regex;100M.txt" + "sort;3G.txt" + "top-n;3G.txt" + "wf;3G.txt" + "spell;3G.txt" + "diff;3G.txt" + "bi-grams;3G.txt" + "set-diff;3G.txt" + "sort-sort;3G.txt" + "shortest-scripts;all_cmdsx100.txt" + ) + +oneliners_bash() { + seq_times_file="seq.res" + seq_outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$seq_times_file" + cat $seq_times_file > $seq_times_file.d + echo executing one-liners $(date) | tee -a "$seq_times_file" + echo '' > "$seq_times_file" + + for script_input in ${scripts_inputs[@]} + do + IFS=";" read -r -a script_input_parsed <<< "${script_input}" + script="${script_input_parsed[0]}" + input="${script_input_parsed[1]}" + + export IN="/$input" + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) 
| tee -a "$seq_times_file" + done +} + +oneliners_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat $times_file > $times_file.d + echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" + echo '' > "$times_file" + + for script_input in ${scripts_inputs[@]} + do + IFS=";" read -r -a script_input_parsed <<< "${script_input}" + script="${script_input_parsed[0]}" + input="${script_input_parsed[1]}" + + export IN="/$input" + + printf -v pad %30s + padded_script="${script}${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done +} + +# oneliners_bash +oneliners_pash "$PASH_FLAGS" "par" +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" diff --git a/evaluation/distr_benchmarks/oneliners/set-diff.sh b/evaluation/distr_benchmarks/oneliners/set-diff.sh new file mode 100755 index 000000000..039e6996f --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/set-diff.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Show the set-difference between two streams (i.e., elements in the first that are not in the second). 
+# https://stackoverflow.com/questions/2509533/bash-linux-set-difference-between-two-text-files + +IN=${IN:-/1G.txt} + +mkfifo s1 s2 + +hdfs dfs -cat $IN | + cut -d ' ' -f 1 | + tr [:lower:] [:upper:] | + sort > s1 & + +hdfs dfs -cat $IN | + cut -d ' ' -f 1 | + sort > s2 & + +comm -23 s1 s2 + +rm s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh new file mode 100755 index 000000000..f6bac1b15 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Find the shortest scripts +# From "Wicked Cool Shell Scripts", 2nd Ed., pg. 7 +# +p.95 multiple sed +# +p.XX crawler + +# FIX: Input here should be a set of commands, more precisely, the ones on this specific machine. + +IN=${IN:-/all_cmds.txt} + +hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 diff --git a/evaluation/distr_benchmarks/oneliners/sort-sort.sh b/evaluation/distr_benchmarks/oneliners/sort-sort.sh new file mode 100755 index 000000000..7b51ed889 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/sort-sort.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Calculate sort twice + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | tr A-Z a-z | sort | sort -r diff --git a/evaluation/distr_benchmarks/oneliners/sort.sh b/evaluation/distr_benchmarks/oneliners/sort.sh new file mode 100755 index 000000000..29cffa1cf --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/sort.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# Sort input + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | sort + diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh new file mode 100755 index 000000000..a5803a5c5 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/spell.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Calculate mispelled words in an input +# https://dl.acm.org/doi/10.1145/3532.315102 
+IN=${IN:-/1G.txt} +dict=${dict:-$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt} + +hdfs dfs -cat $IN | + iconv -f utf-8 -t ascii//translit | # remove non utf8 characters + # groff -t -e -mandoc -Tascii | # remove formatting commands + col -bx | # remove backspaces / linefeeds + tr -cs A-Za-z '\n' | + tr A-Z a-z | # map upper to lower case + tr -d '[:punct:]' | # remove punctuation + sort | # put words in alphabetical order + uniq | # remove duplicate words + comm -23 - $dict # report words not in dictionary diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh new file mode 100755 index 000000000..ac6fbb50e --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/top-n.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# Top-N (1000) terms +# from https://dl.acm.org/doi/10.1145/5948.315654 + +IN=${IN:-/1G.txt} + +hdfs dfs -cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q + diff --git a/evaluation/distr_benchmarks/oneliners/wf.sh b/evaluation/distr_benchmarks/oneliners/wf.sh new file mode 100755 index 000000000..a8a885775 --- /dev/null +++ b/evaluation/distr_benchmarks/oneliners/wf.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Calculate the frequency of each word in the document, and sort by frequency + +IN=${IN:-/10M.txt} + +hdfs dfs -cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn From 748f7ab79e933d4107c483d0f85ad0a33f39bf0f Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 26 Apr 2022 13:47:11 +0000 Subject: [PATCH 03/37] added gitingore for dependecy_untagling --- .../distr_benchmarks/dependency_untangling/input/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 evaluation/distr_benchmarks/dependency_untangling/input/.gitignore diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/.gitignore b/evaluation/distr_benchmarks/dependency_untangling/input/.gitignore new file mode 100644 index 000000000..85940a3d2 --- /dev/null +++ 
b/evaluation/distr_benchmarks/dependency_untangling/input/.gitignore @@ -0,0 +1,4 @@ +* +!.gitignore +!setup.sh +!install-deps.sh From f89cbddb78e993d9d251d31382207c0100ff7014 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 27 Apr 2022 20:08:05 +0000 Subject: [PATCH 04/37] add stateless and pure function annotations Signed-off-by: Tammam Mustafa --- annotations/pure_func.json | 12 ++++++++++++ annotations/stateless_func.json | 12 ++++++++++++ .../distr_benchmarks/dependency_untangling/to_mp3.sh | 6 +++--- 3 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 annotations/pure_func.json create mode 100644 annotations/stateless_func.json diff --git a/annotations/pure_func.json b/annotations/pure_func.json new file mode 100644 index 000000000..1d9163a3f --- /dev/null +++ b/annotations/pure_func.json @@ -0,0 +1,12 @@ +{ + "command": "pure_func", + "cases": + [ + { + "predicate": "default", + "class": "pure", + "inputs": ["stdin"], + "outputs": ["stdout"] + } + ] +} diff --git a/annotations/stateless_func.json b/annotations/stateless_func.json new file mode 100644 index 000000000..f6a62ec6c --- /dev/null +++ b/annotations/stateless_func.json @@ -0,0 +1,12 @@ +{ + "command": "stateless_func", + "cases": + [ + { + "predicate": "default", + "class": "stateless", + "inputs": ["stdin"], + "outputs": ["stdout"] + } + ] +} diff --git a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh index 1f84bb277..c94a75b49 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh @@ -4,18 +4,18 @@ IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/wav} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mp3} LOGS=${OUT}/logs mkdir -p ${LOGS} -trigrams_aux(){ +pure_func(){ ffmpeg -y -i pipe:0 -f mp3 -ab 192000 pipe:1 2>/dev/null } -export -f trigrams_aux +export 
-f pure_func pkg_count=0 for item in $(hdfs dfs -ls -C /for-loops/wav); do pkg_count=$((pkg_count + 1)); out="$OUT/$(basename $item).mp3" - hdfs dfs -cat $item | trigrams_aux > $out + hdfs dfs -cat $item | pure_func > $out done echo 'done'; From ab5c3b94984f3464f71e98fee9ea4609feb30b81 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 27 Apr 2022 21:52:48 +0000 Subject: [PATCH 05/37] improve oneliners hdfs setup script Signed-off-by: Tammam Mustafa --- .../distr_benchmarks/oneliners/input/setup.sh | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index eb08a2d42..b7d0eb1e7 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -3,10 +3,12 @@ #set -e PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} +REPLICATION_FACTOR=2 # another solution for capturing HTTP status code # https://superuser.com/a/590170 -input_files="1M.txt 10M.txt 100M.txt 1G.txt dict.txt 3G.txt 10G.txt 100G.txt all_cmds.txt all_cmdsx100.txt small" +input_files=("1M.txt" "10M.txt" "100M.txt" "1G.txt" "all_cmds.txt" "all_cmdsx100.txt") +local_fils=("dict.txt") if [ ! -f ./1M.txt ]; then curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt @@ -55,37 +57,37 @@ if [ ! -f ./all_cmds.txt ]; then fi fi -hdfs dfs -put ./10M.txt /10M.txt -hdfs dfs -put ./100M.txt /100M.txt -hdfs dfs -put ./1G.txt /1G.txt -hdfs dfs -put ./all_cmds.txt +if [ ! -f ./all_cmdsx100.txt ]; then + touch all_cmdsx100.txt + for (( i = 0; i < 100; i++ )); do + cat all_cmds.txt >> all_cmdsx100.txt + done +fi if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then echo "Generating full-size inputs" - # FIXME PR: Do we need all of them? - - # if [ ! -f ./3G.txt ]; then - # touch 3G.txt - # for (( i = 0; i < 3; i++ )); do - # cat 1G.txt >> 3G.txt - # done - # fi - # hdfs dfs -put ./3G.txt /3G.txt - - # if [ ! 
-f ./10G.txt ]; then - # touch 10G.txt - # for (( i = 0; i < 10; i++ )); do - # cat 1G.txt >> 10G.txt - # done - # fi - # hdfs dfs -put ./10G.txt /10G.txt - - if [ ! -f ./all_cmdsx100.txt ]; then - touch all_cmdsx100.txt - for (( i = 0; i < 100; i++ )); do - cat all_cmds.txt >> all_cmdsx100.txt + + + if [ ! -f ./3G.txt ]; then + touch 3G.txt + for (( i = 0; i < 3; i++ )); do + cat 1G.txt >> 3G.txt + done + fi + input_files+=("3G.txt") + + if [ ! -f ./10G.txt ]; then + touch 10G.txt + for (( i = 0; i < 10; i++ )); do + cat 1G.txt >> 10G.txt done fi - hdfs dfs -put ./all_cmdsx100.txt /all_cmdsx100.txt -fi \ No newline at end of file + input_files+=("10G.txt") +fi + + +for file in "${input_files[@]}"; do + hdfs dfs -Ddfs.replication=$REPLICATION_FACTOR -put $file /$file + rm $file # remove local file after putting it into hdfs +done \ No newline at end of file From 32a0b86fd3b30879eb91074453bea888f090e4d4 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 1 May 2022 12:51:46 +0000 Subject: [PATCH 06/37] fix path bug in spell.sh --- evaluation/distr_benchmarks/oneliners/spell.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh index a5803a5c5..7928babe4 100755 --- a/evaluation/distr_benchmarks/oneliners/spell.sh +++ b/evaluation/distr_benchmarks/oneliners/spell.sh @@ -2,7 +2,7 @@ # Calculate mispelled words in an input # https://dl.acm.org/doi/10.1145/3532.315102 IN=${IN:-/1G.txt} -dict=${dict:-$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt} +dict=${dict:-$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt} hdfs dfs -cat $IN | iconv -f utf-8 -t ascii//translit | # remove non utf8 characters From 2b0876b0eda04510fdf33f8dd2e57920871f8094 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 1 May 2022 13:00:17 +0000 Subject: [PATCH 07/37] allow varying replication factor in tests --- .../distr_benchmarks/oneliners/input/setup.sh | 
5 +++-- .../distr_benchmarks/oneliners/run.distr.sh | 20 +++++++++++++------ .../oneliners/shortest-scripts.sh | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index b7d0eb1e7..0ea6efd6c 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -86,8 +86,9 @@ if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then input_files+=("10G.txt") fi - +# Add files with different replication factors for file in "${input_files[@]}"; do - hdfs dfs -Ddfs.replication=$REPLICATION_FACTOR -put $file /$file + hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file + hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file rm $file # remove local file after putting it into hdfs done \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 7a3b4a4f2..699a23228 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -19,6 +19,7 @@ oneliners_bash() { seq_times_file="seq.res" seq_outputs_suffix="seq.out" outputs_dir="outputs" + rep=${3:-rep3} mkdir -p "$outputs_dir" @@ -33,7 +34,7 @@ oneliners_bash() { script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$input" + export IN="/$rep\_$input" printf -v pad %30s padded_script="${script}${pad}" @@ -48,6 +49,8 @@ oneliners_bash() { oneliners_pash(){ flags=${1:-$PASH_FLAGS} prefix=${2:-par} + rep=${3:-rep3} + prefix=$prefix\_$rep times_file="$prefix.res" outputs_suffix="$prefix.out" @@ -60,7 +63,7 @@ oneliners_pash(){ touch "$times_file" cat $times_file > $times_file.d - echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" + echo executing one-liners with $prefix pash with data $rep $(date) | tee -a "$times_file" echo '' > "$times_file" for script_input 
in ${scripts_inputs[@]} @@ -69,7 +72,7 @@ oneliners_pash(){ script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$input" + export IN="/$rep\_$input" printf -v pad %30s padded_script="${script}${pad}" @@ -85,6 +88,11 @@ oneliners_pash(){ done } -# oneliners_bash -oneliners_pash "$PASH_FLAGS" "par" -oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" +oneliners_bash "rep1" +oneliners_bash "rep3" + +oneliners_pash "$PASH_FLAGS" "par" "rep1" +oneliners_pash "$PASH_FLAGS" "par" "rep3" + +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3" diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh index f6bac1b15..63a5bc3d9 100755 --- a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh +++ b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh @@ -6,6 +6,6 @@ # FIX: Input here should be a set of commands, more precisely, the ones on this specific machine. 
-IN=${IN:-/all_cmds.txt} +IN=${IN:-/all_cmdsx100.txt} hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 From 906074254ed7c1ad930cbddfe521fc4da1113ebc Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 4 May 2022 00:12:00 +0000 Subject: [PATCH 08/37] improve benchmark scripts --- evaluation/distr_benchmarks/oneliners/input/setup.sh | 8 +++++--- evaluation/distr_benchmarks/oneliners/run.distr.sh | 12 +++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index 0ea6efd6c..3d4921c22 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -1,15 +1,18 @@ #!/bin/bash - #set -e PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} -REPLICATION_FACTOR=2 # another solution for capturing HTTP status code # https://superuser.com/a/590170 input_files=("1M.txt" "10M.txt" "100M.txt" "1G.txt" "all_cmds.txt" "all_cmdsx100.txt") local_fils=("dict.txt") +if [[ "$1" == "-c" ]]; then + rm -f $input_files "3G.txt" "10G.txt" + exit +fi + if [ ! -f ./1M.txt ]; then curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? 
-ne 0 ]; then @@ -90,5 +93,4 @@ fi for file in "${input_files[@]}"; do hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file - rm $file # remove local file after putting it into hdfs done \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 699a23228..dd27315e9 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -1,9 +1,9 @@ -PASH_FLAGS='--width 6 --r_split' +PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" scripts_inputs=( - "nfa-regex;100M.txt" + "nfa-regex;1G.txt" "sort;3G.txt" "top-n;3G.txt" "wf;3G.txt" @@ -16,10 +16,10 @@ scripts_inputs=( ) oneliners_bash() { - seq_times_file="seq.res" - seq_outputs_suffix="seq.out" outputs_dir="outputs" - rep=${3:-rep3} + rep=${1:-rep3} + seq_times_file="$rep\_seq.res" + seq_outputs_suffix="$rep\_seq.out" mkdir -p "$outputs_dir" @@ -35,6 +35,7 @@ oneliners_bash() { input="${script_input_parsed[1]}" export IN="/$rep\_$input" + export dict= printf -v pad %30s padded_script="${script}${pad}" @@ -73,6 +74,7 @@ oneliners_pash(){ input="${script_input_parsed[1]}" export IN="/$rep\_$input" + export dict= printf -v pad %30s padded_script="${script}${pad}" From bc03de75187a5161b54a9adfd5a0cc7c6415e568 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 8 May 2022 13:48:49 +0000 Subject: [PATCH 09/37] fix small bug --- evaluation/distr_benchmarks/oneliners/run.distr.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index dd27315e9..84913531c 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -1,6 +1,8 @@ PASH_FLAGS='--width 8 
--r_split' export TIMEFORMAT=%R -export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" +export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt" +curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > $dict + scripts_inputs=( "nfa-regex;1G.txt" @@ -18,8 +20,8 @@ scripts_inputs=( oneliners_bash() { outputs_dir="outputs" rep=${1:-rep3} - seq_times_file="$rep\_seq.res" - seq_outputs_suffix="$rep\_seq.out" + seq_times_file=$rep"_seq.res" + seq_outputs_suffix=$rep"_seq.out" mkdir -p "$outputs_dir" @@ -34,7 +36,7 @@ oneliners_bash() { script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$rep\_$input" + export IN=/$rep\_$input export dict= printf -v pad %30s @@ -73,7 +75,7 @@ oneliners_pash(){ script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN="/$rep\_$input" + export IN=/$rep\_$input export dict= printf -v pad %30s From e933a36d6e5bafb5a2a5407598300b360d02b208 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 10:55:15 -0400 Subject: [PATCH 10/37] port nlp scripts to distributed exec --- evaluation/distr_benchmarks/nlp/.gitignore | 2 + evaluation/distr_benchmarks/nlp/1_1.sh | 15 +++++ evaluation/distr_benchmarks/nlp/2_1.sh | 17 ++++++ evaluation/distr_benchmarks/nlp/2_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/3_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/3_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/3_3.sh | 16 +++++ evaluation/distr_benchmarks/nlp/4_3.sh | 19 ++++++ evaluation/distr_benchmarks/nlp/4_3b.sh | 25 ++++++++ evaluation/distr_benchmarks/nlp/6_1.sh | 27 +++++++++ evaluation/distr_benchmarks/nlp/6_1_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_1_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_2.sh | 18 ++++++ evaluation/distr_benchmarks/nlp/6_3.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_4.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_5.sh | 16 +++++ evaluation/distr_benchmarks/nlp/6_7.sh | 19 ++++++ 
evaluation/distr_benchmarks/nlp/7_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/7_2.sh | 16 +++++ evaluation/distr_benchmarks/nlp/8.2_1.sh | 16 +++++ evaluation/distr_benchmarks/nlp/8.2_2.sh | 26 ++++++++ evaluation/distr_benchmarks/nlp/8.3_2.sh | 24 ++++++++ evaluation/distr_benchmarks/nlp/8.3_3.sh | 25 ++++++++ evaluation/distr_benchmarks/nlp/8_1.sh | 23 ++++++++ .../distr_benchmarks/nlp/input/.gitignore | 4 ++ .../distr_benchmarks/nlp/input/setup.sh | 59 +++++++++++++++++++ 26 files changed, 495 insertions(+) create mode 100644 evaluation/distr_benchmarks/nlp/.gitignore create mode 100755 evaluation/distr_benchmarks/nlp/1_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/2_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/2_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/3_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/3_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/3_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/4_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/4_3b.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_1_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_1_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_4.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_5.sh create mode 100755 evaluation/distr_benchmarks/nlp/6_7.sh create mode 100755 evaluation/distr_benchmarks/nlp/7_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/7_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.2_1.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.2_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.3_2.sh create mode 100755 evaluation/distr_benchmarks/nlp/8.3_3.sh create mode 100755 evaluation/distr_benchmarks/nlp/8_1.sh create mode 100644 evaluation/distr_benchmarks/nlp/input/.gitignore 
create mode 100755 evaluation/distr_benchmarks/nlp/input/setup.sh diff --git a/evaluation/distr_benchmarks/nlp/.gitignore b/evaluation/distr_benchmarks/nlp/.gitignore new file mode 100644 index 000000000..1dd206e6f --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/.gitignore @@ -0,0 +1,2 @@ +exodus +genesis diff --git a/evaluation/distr_benchmarks/nlp/1_1.sh b/evaluation/distr_benchmarks/nlp/1_1.sh new file mode 100755 index 000000000..7ff63c21a --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/1_1.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# tag: count_words + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/1_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/2_1.sh b/evaluation/distr_benchmarks/nlp/2_1.sh new file mode 100755 index 000000000..4e35100a8 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/2_1.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# tag: merge_upper +# set -e + +# Merge upper and lower counts +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -sc '[A-Z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/2_2.sh b/evaluation/distr_benchmarks/nlp/2_2.sh new file mode 100755 index 000000000..8111b23aa --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/2_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: count_vowel_seq +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input 
in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr 'a-z' '[A-Z]' | tr -sc 'AEIOU' '[\012*]'| sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/3_1.sh b/evaluation/distr_benchmarks/nlp/3_1.sh new file mode 100755 index 000000000..6082bb1c6 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/3_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: sort +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -nr > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/3_2.sh b/evaluation/distr_benchmarks/nlp/3_2.sh new file mode 100755 index 000000000..571481d1d --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/3_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: sort_words_by_folding +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -f > ${OUT}/${input} +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/3_3.sh b/evaluation/distr_benchmarks/nlp/3_3.sh new file mode 100755 index 000000000..ff67ea089 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/3_3.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: sort_words_by_rhyming.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) 
+do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | rev | sort | rev > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh new file mode 100755 index 000000000..c20a8cf0d --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# tag: bigrams.sh +# set -e + +# Bigrams (contrary to our version, this uses intermediary files) +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.input.words + tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords + paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/4_3b.sh b/evaluation/distr_benchmarks/nlp/4_3b.sh new file mode 100755 index 000000000..1df2cdd20 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/4_3b.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#tag: count_trigrams.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3b/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +pure_func() { + input=$1 + cat > ${OUT}/${input}.words + tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords + tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 + paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | + sort | uniq -c +} +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.trigrams +done + +echo 'done'; +rm -rf ${OUT} diff --git 
a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh new file mode 100755 index 000000000..8d8f29220 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# tag: trigram_rec +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/6_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +trigrams() { + input=$1 + tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.words + tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords + tail +3 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 + paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | sort | uniq -c + rm -f ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 +} +export -f trigrams + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN"/"$input | grep 'the land of' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 + hdfs dfs -cat $IN"/"$input | grep 'And he said' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_1_1.sh b/evaluation/distr_benchmarks/nlp/6_1_1.sh new file mode 100755 index 000000000..784e7b6a9 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_1_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: uppercase_by_token +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^[A-Z]' > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/6_1_2.sh b/evaluation/distr_benchmarks/nlp/6_1_2.sh new file mode 100755 index 000000000..779a0defb --- /dev/null +++ 
b/evaluation/distr_benchmarks/nlp/6_1_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: uppercase_by_type +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^[A-Z]' > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/6_2.sh b/evaluation/distr_benchmarks/nlp/6_2.sh new file mode 100755 index 000000000..021207494 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_2.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# tag: four-letter words +# set -e + +# the original script has both versions +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^....$' > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^....$' > ${OUT}/${input}.out1 +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_3.sh b/evaluation/distr_benchmarks/nlp/6_3.sh new file mode 100755 index 000000000..a4f15479c --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_3.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: words_no_vowels +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -vi '[aeiou]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_4.sh 
b/evaluation/distr_benchmarks/nlp/6_4.sh new file mode 100755 index 000000000..a7727b0a5 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_4.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: 1-syllable words +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_4/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat ${IN}/${input} | tr -sc '[A-Z][a-z]' '[\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*$' | sort | uniq -c | sed 5q > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_5.sh b/evaluation/distr_benchmarks/nlp/6_5.sh new file mode 100755 index 000000000..413d59696 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_5.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: 2-syllable words +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_5/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' ' [\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]$' | sort | uniq -c | sed 5q > ${OUT}${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/6_7.sh b/evaluation/distr_benchmarks/nlp/6_7.sh new file mode 100755 index 000000000..9396d23c0 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/6_7.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# tag: verse_2om_3om_2instances +# set -e +# verses with 2 or more, 3 or more, exactly 2 instances of light. 
+ +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_7/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | grep -c 'light.\*light' > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | grep -c 'light.\*light.\*light' > ${OUT}/${input}.out1 + hdfs dfs -cat $IN/$input | grep 'light.\*light' | grep -vc 'light.\*light.\*light' > ${OUT}/${input}.out2 +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/7_1.sh b/evaluation/distr_benchmarks/nlp/7_1.sh new file mode 100755 index 000000000..7f3f81518 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/7_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: count_morphs +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/7_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | sed 's/ly$/-ly/g' | sed 's/ .*//g' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/7_2.sh b/evaluation/distr_benchmarks/nlp/7_2.sh new file mode 100755 index 000000000..7ba0e1b38 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/7_2.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# set -e +# tag: count_consonant_sequences + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/7_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -sc 'BCDFGHJKLMNPQRSTVWXYZ' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/8.2_1.sh b/evaluation/distr_benchmarks/nlp/8.2_1.sh new file mode 100755 index 000000000..94bc2a383 --- 
/dev/null +++ b/evaluation/distr_benchmarks/nlp/8.2_1.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# tag: vowel_sequences_gr_1K.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.2_1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | tr -sc 'AEIOUaeiou' '[\012*]' | sort | uniq -c | awk "\$1 >= 1000" > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8.2_2.sh b/evaluation/distr_benchmarks/nlp/8.2_2.sh new file mode 100755 index 000000000..3ac31555d --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8.2_2.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# tag: bigrams_appear_twice.sh +# set -e + +# Calculate the bigrams (based on 4_3.sh script) +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.2_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +pure_func() { + input=$1 + cat > ${OUT}/${input}.input.words + tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords + paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams + awk "\$1 == 2 {print \$2, \$3}" ${OUT}/${input}.input.bigrams +} + +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8.3_2.sh b/evaluation/distr_benchmarks/nlp/8.3_2.sh new file mode 100755 index 000000000..a60da077c --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8.3_2.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# tag: find_anagrams.sh +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_2/} +ENTRIES=${ENTRIES:-1060} +mkdir -p 
"$OUT" + +pure_func() { + input=$1 + sort -u > ${OUT}/${input}.types + rev < ${OUT}/${input}.types > ${OUT}/${input}.types.rev + sort ${OUT}/${input}.types ${OUT}/${input}.types.rev | uniq -c | awk "\$1 >= 2 {print \$2}" +} + +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh new file mode 100755 index 000000000..e397fb939 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# tag: compare_exodus_genesis.sh +# set -e + +IN=${IN:-/nlp/pg/} +INPUT2=${INPUT2:-$PASH_TOP/evaluation/distr_benchmarks/nlp/input/exodus} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} +ENTRIES=${ENTRIES:-1060} +mkdir -p $OUT + +pure_func() { + input=$1 + cat > ${OUT}/${input}1.types + hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${OUT}/${input}2.types + sort $OUT/${input}1.types ${OUT}/${input}2.types ${OUT}/${input}2.types | uniq -c | head + +} +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/8_1.sh b/evaluation/distr_benchmarks/nlp/8_1.sh new file mode 100755 index 000000000..b274e7946 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/8_1.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# tag: sort_words_by_num_of_syllables +# set -e + +IN=${IN:-/nlp/pg/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.1/} +ENTRIES=${ENTRIES:-1060} +mkdir -p "$OUT" + +pure_func() { + input=$1 + cat > ${OUT}/${input}.words + tr -sc '[AEIOUaeiou\012]' ' ' < 
${OUT}/${input}.words | awk '{print NF}' > ${OUT}/${input}.syl + paste ${OUT}/${input}.syl ${OUT}/${input}.words | sort -nr | sed 5q +} +export -f pure_func +for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) +do + hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out +done + +echo 'done'; +rm -rf "${OUT}" diff --git a/evaluation/distr_benchmarks/nlp/input/.gitignore b/evaluation/distr_benchmarks/nlp/input/.gitignore new file mode 100644 index 000000000..d815bc4e8 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/input/.gitignore @@ -0,0 +1,4 @@ +* +!pipelines.sh +!setup.sh +!.gitignore diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh new file mode 100755 index 000000000..1875bbb8a --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +[[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } + +setup_dataset() { + if [ ! -f ./genesis ]; then + curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis + "$PASH_TOP/scripts/append_nl_if_not.sh" genesis + fi + + if [ ! -f ./exodus ]; then + curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus + "$PASH_TOP/scripts/append_nl_if_not.sh" exodus + fi + + if [ ! -e ./pg ]; then + mkdir pg + cd pg + if [[ "$1" == "--gen-full" ]]; then + echo 'N.b.: download/extraction will take about 10min' + wget ndr.md/data/pg.tar.xz + if [ $? -ne 0 ]; then + cat <<-'EOF' | sed 's/^ *//' + Downloading input dataset failed, thus need to manually rsync all books from project gutenberg: + rsync -av --del --prune-empty-dirs --include='*.txt' --include='*/' --exclude='*' ftp@ftp.ibiblio.org::gutenberg . 
+ please contact the pash developers pash-devs@googlegroups.com + EOF + exit 1 + fi + cat pg.tar.xz | tar -xJ + else + wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip + unzip nlp.zip + mv data/* . + rm nlp.zip data -rf + fi + for f in *.txt; do + "$PASH_TOP/scripts/append_nl_if_not.sh" $f + done + cd .. + fi + + # Put files in hdfs + hdfs dfs -mkdir /nlp + hdfs dfs -put exodus /nlp/exodus + hdfs dfs -put genesis /nlp/genesis + hdfs dfs -put pg /nlp/pg +} + +source_var() { + if [[ "$1" == "--small" ]]; then + export ENTRIES=40 + else + # 1% of the input + export ENTRIES=1060 + fi +} From 3f65e428be9ade33e23ff95af4581d2baffce8b0 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 13:56:05 -0400 Subject: [PATCH 11/37] replace non parallelizable tr with parallelizable variation --- evaluation/distr_benchmarks/nlp/1_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/2_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/2_2.sh | 2 +- evaluation/distr_benchmarks/nlp/3_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/3_2.sh | 4 ++-- evaluation/distr_benchmarks/nlp/3_3.sh | 4 ++-- evaluation/distr_benchmarks/nlp/4_3.sh | 4 ++-- evaluation/distr_benchmarks/nlp/4_3b.sh | 4 ++-- evaluation/distr_benchmarks/nlp/6_1.sh | 2 +- evaluation/distr_benchmarks/nlp/6_1_1.sh | 2 +- evaluation/distr_benchmarks/nlp/6_1_2.sh | 2 +- evaluation/distr_benchmarks/nlp/6_2.sh | 4 ++-- evaluation/distr_benchmarks/nlp/6_3.sh | 2 +- evaluation/distr_benchmarks/nlp/6_4.sh | 2 +- evaluation/distr_benchmarks/nlp/6_5.sh | 2 +- evaluation/distr_benchmarks/nlp/8.2_1.sh | 2 +- evaluation/distr_benchmarks/nlp/8.2_2.sh | 2 +- evaluation/distr_benchmarks/nlp/8.3_2.sh | 2 +- evaluation/distr_benchmarks/nlp/8.3_3.sh | 2 +- evaluation/distr_benchmarks/nlp/8_1.sh | 2 +- evaluation/distr_benchmarks/oneliners/top-n.sh | 2 +- evaluation/distr_benchmarks/oneliners/wf.sh | 4 ++-- 22 files changed, 31 insertions(+), 31 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/1_1.sh 
b/evaluation/distr_benchmarks/nlp/1_1.sh index 7ff63c21a..50aa77bbd 100755 --- a/evaluation/distr_benchmarks/nlp/1_1.sh +++ b/evaluation/distr_benchmarks/nlp/1_1.sh @@ -2,13 +2,13 @@ # tag: count_words IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/1_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/1_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/2_1.sh b/evaluation/distr_benchmarks/nlp/2_1.sh index 4e35100a8..b89d2f48f 100755 --- a/evaluation/distr_benchmarks/nlp/2_1.sh +++ b/evaluation/distr_benchmarks/nlp/2_1.sh @@ -4,13 +4,13 @@ # Merge upper and lower counts IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/2_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -sc '[A-Z]' '[\012*]' | sort | uniq -c > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr '[a-z]' '[A-Z]' | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/2_2.sh b/evaluation/distr_benchmarks/nlp/2_2.sh index 8111b23aa..39d8e9b0c 100755 --- a/evaluation/distr_benchmarks/nlp/2_2.sh +++ b/evaluation/distr_benchmarks/nlp/2_2.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/2_2/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/2_2/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" diff --git 
a/evaluation/distr_benchmarks/nlp/3_1.sh b/evaluation/distr_benchmarks/nlp/3_1.sh index 6082bb1c6..2a58b2861 100755 --- a/evaluation/distr_benchmarks/nlp/3_1.sh +++ b/evaluation/distr_benchmarks/nlp/3_1.sh @@ -3,13 +3,13 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/3_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -nr > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -nr > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/3_2.sh b/evaluation/distr_benchmarks/nlp/3_2.sh index 571481d1d..51d55ffdc 100755 --- a/evaluation/distr_benchmarks/nlp/3_2.sh +++ b/evaluation/distr_benchmarks/nlp/3_2.sh @@ -3,13 +3,13 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_2/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/3_2/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | sort -f > ${OUT}/${input} + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -f > ${OUT}/${input} done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/3_3.sh b/evaluation/distr_benchmarks/nlp/3_3.sh index ff67ea089..909e5a4bd 100755 --- a/evaluation/distr_benchmarks/nlp/3_3.sh +++ b/evaluation/distr_benchmarks/nlp/3_3.sh @@ -3,13 +3,13 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/3_3/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/3_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} 
| head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort | uniq -c | rev | sort | rev > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | rev | sort | rev > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index c20a8cf0d..100c78918 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -4,13 +4,13 @@ # Bigrams (contrary to our version, this uses intermediary files) IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.input.words + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" > ${OUT}/${input}.input.words tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams done diff --git a/evaluation/distr_benchmarks/nlp/4_3b.sh b/evaluation/distr_benchmarks/nlp/4_3b.sh index 1df2cdd20..a77f9dd26 100755 --- a/evaluation/distr_benchmarks/nlp/4_3b.sh +++ b/evaluation/distr_benchmarks/nlp/4_3b.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/4_3b/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3b/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" @@ -18,7 +18,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.trigrams + hdfs dfs -cat 
$IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | pure_func $input > ${OUT}/${input}.trigrams done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 8d8f29220..39c328c20 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -OUT=${OUT:-$PASH_TOP/evaluation/benchmarks/nlp/output/6_1/} +OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" diff --git a/evaluation/distr_benchmarks/nlp/6_1_1.sh b/evaluation/distr_benchmarks/nlp/6_1_1.sh index 784e7b6a9..c92af69ee 100755 --- a/evaluation/distr_benchmarks/nlp/6_1_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1_1.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^[A-Z]' > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -c '^[A-Z]' > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_1_2.sh b/evaluation/distr_benchmarks/nlp/6_1_2.sh index 779a0defb..72041d3e1 100755 --- a/evaluation/distr_benchmarks/nlp/6_1_2.sh +++ b/evaluation/distr_benchmarks/nlp/6_1_2.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^[A-Z]' > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | grep -c '^[A-Z]' > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_2.sh b/evaluation/distr_benchmarks/nlp/6_2.sh index 021207494..5227daffe 100755 --- a/evaluation/distr_benchmarks/nlp/6_2.sh +++ b/evaluation/distr_benchmarks/nlp/6_2.sh @@ -10,8 +10,8 @@ mkdir -p "$OUT" for input in 
$(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -c '^....$' > ${OUT}/${input}.out0 - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | grep -c '^....$' > ${OUT}/${input}.out1 + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -c '^....$' > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | grep -c '^....$' > ${OUT}/${input}.out1 done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_3.sh b/evaluation/distr_benchmarks/nlp/6_3.sh index a4f15479c..699c3eafd 100755 --- a/evaluation/distr_benchmarks/nlp/6_3.sh +++ b/evaluation/distr_benchmarks/nlp/6_3.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | grep -vi '[aeiou]' | sort | uniq -c > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -vi '[aeiou]' | sort | uniq -c > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_4.sh b/evaluation/distr_benchmarks/nlp/6_4.sh index a7727b0a5..bd47e042c 100755 --- a/evaluation/distr_benchmarks/nlp/6_4.sh +++ b/evaluation/distr_benchmarks/nlp/6_4.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat ${IN}/${input} | tr -sc '[A-Z][a-z]' '[\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*$' | sort | uniq -c | sed 5q > ${OUT}/${input}.out + hdfs dfs -cat ${IN}/${input} | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -i '^[^aeiou]*[aeiou][^aeiou]*$' | sort | uniq -c | sed 5q > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/6_5.sh b/evaluation/distr_benchmarks/nlp/6_5.sh index 413d59696..90d65e4a9 100755 --- 
a/evaluation/distr_benchmarks/nlp/6_5.sh +++ b/evaluation/distr_benchmarks/nlp/6_5.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' ' [\012*]' | grep -i '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]$' | sort | uniq -c | sed 5q > ${OUT}${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | grep -i '^[^aeiou]*[aeiou][^aeiou]*[aeiou][^aeiou]$' | sort | uniq -c | sed 5q > ${OUT}${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.2_1.sh b/evaluation/distr_benchmarks/nlp/8.2_1.sh index 94bc2a383..f03a56985 100755 --- a/evaluation/distr_benchmarks/nlp/8.2_1.sh +++ b/evaluation/distr_benchmarks/nlp/8.2_1.sh @@ -9,7 +9,7 @@ mkdir -p "$OUT" for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | tr -sc 'AEIOUaeiou' '[\012*]' | sort | uniq -c | awk "\$1 >= 1000" > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr -sc 'AEIOUaeiou' '[\012*]' | sort | uniq -c | awk "\$1 >= 1000" > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.2_2.sh b/evaluation/distr_benchmarks/nlp/8.2_2.sh index 3ac31555d..dc6ec685b 100755 --- a/evaluation/distr_benchmarks/nlp/8.2_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.2_2.sh @@ -19,7 +19,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.3_2.sh b/evaluation/distr_benchmarks/nlp/8.3_2.sh index a60da077c..47454d3b8 100755 --- 
a/evaluation/distr_benchmarks/nlp/8.3_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_2.sh @@ -17,7 +17,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index e397fb939..22dfe96c3 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -18,7 +18,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/8_1.sh b/evaluation/distr_benchmarks/nlp/8_1.sh index b274e7946..7973476ba 100755 --- a/evaluation/distr_benchmarks/nlp/8_1.sh +++ b/evaluation/distr_benchmarks/nlp/8_1.sh @@ -16,7 +16,7 @@ pure_func() { export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out done echo 'done'; diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh index ac6fbb50e..c2f7f2b21 100755 --- a/evaluation/distr_benchmarks/oneliners/top-n.sh +++ b/evaluation/distr_benchmarks/oneliners/top-n.sh @@ -4,5 +4,5 @@ IN=${IN:-/1G.txt} -hdfs dfs 
-cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q +hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q diff --git a/evaluation/distr_benchmarks/oneliners/wf.sh b/evaluation/distr_benchmarks/oneliners/wf.sh index a8a885775..eea9c2c58 100755 --- a/evaluation/distr_benchmarks/oneliners/wf.sh +++ b/evaluation/distr_benchmarks/oneliners/wf.sh @@ -1,6 +1,6 @@ #!/bin/bash # Calculate the frequency of each word in the document, and sort by frequency -IN=${IN:-/10M.txt} +IN=${IN:-/rep3_10M.txt} -hdfs dfs -cat $IN | tr -cs A-Za-z '\n' | tr A-Z a-z | sort | uniq -c | sort -rn +hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn From 9d11dd64e68c6a6767d06265d8955226780b1d0b Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 15:14:42 -0400 Subject: [PATCH 12/37] nlp eval script --- evaluation/distr_benchmarks/nlp/8.3_3.sh | 2 +- evaluation/distr_benchmarks/nlp/run.distr.sh | 112 +++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) create mode 100755 evaluation/distr_benchmarks/nlp/run.distr.sh diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 22dfe96c3..71b873a21 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -3,7 +3,7 @@ # set -e IN=${IN:-/nlp/pg/} -INPUT2=${INPUT2:-$PASH_TOP/evaluation/distr_benchmarks/nlp/input/exodus} +INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} ENTRIES=${ENTRIES:-1060} mkdir -p $OUT diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh new file mode 100755 index 000000000..48dba3fa6 --- /dev/null +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -0,0 +1,112 @@ +PASH_FLAGS='--width 8 --r_split --parallel_pipelines' +export TIMEFORMAT=%R + +if [[ "$1" == "--small" ]]; 
then + echo "Using small input" + export ENTRIES=40 +else + echo "Using full input" + export ENTRIES=1060 +fi + +names_scripts=( + "1syllable_words;6_4" + "2syllable_words;6_5" + "4letter_words;6_2" + "bigrams_appear_twice;8.2_2" + "bigrams;4_3" + "compare_exodus_genesis;8.3_3" + "count_consonant_seq;7_2" + # "count_morphs;7_1" + "count_trigrams;4_3b" + "count_vowel_seq;2_2" + "count_words;1_1" + "find_anagrams;8.3_2" + "merge_upper;2_1" + "sort;3_1" + "sort_words_by_folding;3_2" + "sort_words_by_num_of_syllables;8_1" + "sort_words_by_rhyming;3_3" + # "trigram_rec;6_1" + "uppercase_by_token;6_1_1" + "uppercase_by_type;6_1_2" + "verses_2om_3om_2instances;6_7" + "vowel_sequencies_gr_1K;8.2_1" + "words_no_vowels;6_3" + ) + +bash_nlp(){ + outputs_dir="outputs" + rep=${1:-rep3} + times_file=$rep"_seq.res" + outputs_suffix=$rep"_seq.out" + + mkdir -p "$outputs_dir" + + touch "$times_file" + echo executing Unix-for-nlp $(date) | tee -a "$times_file" + echo '' >> "$times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + printf -v pad %30s + padded_script="${name}.sh:${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done + cd .. 
+} + +nlp_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + rep=${3:-rep3} + prefix=$prefix\_$rep + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + echo executing Unix-for-nlp with pash $(date) | tee -a "$times_file" + echo '' >> "$times_file" + + for name_script in ${names_scripts[@]} + do + IFS=";" read -r -a name_script_parsed <<< "${name_script}" + name="${name_script_parsed[0]}" + script="${name_script_parsed[1]}" + printf -v pad %30s + padded_script="${name}.sh:${pad}" + padded_script=${padded_script:0:30} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $PASH_FLAGS --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done + cd .. 
+} + +# bash_nlp "rep1" +bash_nlp "rep3" + +# nlp_pash "$PASH_FLAGS" "par" "rep1" +nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 6" "par" "rep3" + +# nlp_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" +nlp_pash "$PASH_FLAGS --distributed_exec --parallel_pipelines_limit 24" "distr" "rep3" From eecff28b6165143d6e4c24db90609dc2d7c90f35 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 30 May 2022 15:41:44 -0400 Subject: [PATCH 13/37] fix incorrect flags --- evaluation/distr_benchmarks/nlp/run.distr.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index 48dba3fa6..58b84481c 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -96,7 +96,7 @@ nlp_pash(){ single_time_file="${outputs_dir}/${script}.${time_suffix}" echo -n "${padded_script}" | tee -a "$times_file" - { time "$PASH_TOP/pa.sh" $PASH_FLAGS --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done cd .. 
@@ -106,7 +106,7 @@ nlp_pash(){ bash_nlp "rep3" # nlp_pash "$PASH_FLAGS" "par" "rep1" -nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 6" "par" "rep3" +nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 8" "par" "rep3" # nlp_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" nlp_pash "$PASH_FLAGS --distributed_exec --parallel_pipelines_limit 24" "distr" "rep3" From 4749aa4df02ac050629fc5c5d7f075170939e6c3 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 09:54:45 -0400 Subject: [PATCH 14/37] fixed small issues in eval scripts --- evaluation/distr_benchmarks/nlp/run.distr.sh | 42 +++++++++---------- .../distr_benchmarks/oneliners/run.distr.sh | 14 +++---- evaluation/distr_benchmarks/run_all.sh | 5 +++ 3 files changed, 32 insertions(+), 29 deletions(-) create mode 100755 evaluation/distr_benchmarks/run_all.sh diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index 58b84481c..b693d5065 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -1,12 +1,12 @@ -PASH_FLAGS='--width 8 --r_split --parallel_pipelines' +PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R -if [[ "$1" == "--small" ]]; then - echo "Using small input" - export ENTRIES=40 +if [[ "$1" == "--full" ]]; then + echo "Using full input" + export ENTRIES=1060 else - echo "Using full input" - export ENTRIES=1060 + echo "Using small input" + export ENTRIES=120 fi names_scripts=( @@ -14,7 +14,7 @@ names_scripts=( "2syllable_words;6_5" "4letter_words;6_2" "bigrams_appear_twice;8.2_2" - "bigrams;4_3" + # "bigrams;4_3" "compare_exodus_genesis;8.3_3" "count_consonant_seq;7_2" # "count_morphs;7_1" @@ -37,14 +37,14 @@ names_scripts=( bash_nlp(){ outputs_dir="outputs" - rep=${1:-rep3} - times_file=$rep"_seq.res" - outputs_suffix=$rep"_seq.out" + times_file="seq.res" + outputs_suffix="seq.out" mkdir -p "$outputs_dir" touch "$times_file" - echo executing Unix-for-nlp $(date) | tee -a 
"$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix-for-nlp $(date) | tee "$times_file" echo '' >> "$times_file" for name_script in ${names_scripts[@]} @@ -60,14 +60,11 @@ bash_nlp(){ echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" done - cd .. } nlp_pash(){ flags=${1:-$PASH_FLAGS} prefix=${2:-par} - rep=${3:-rep3} - prefix=$prefix\_$rep times_file="$prefix.res" outputs_suffix="$prefix.out" @@ -79,7 +76,8 @@ nlp_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - echo executing Unix-for-nlp with pash $(date) | tee -a "$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix-for-nlp with $prefix pash $(date) | tee "$times_file" echo '' >> "$times_file" for name_script in ${names_scripts[@]} @@ -99,14 +97,14 @@ nlp_pash(){ { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done - cd .. } -# bash_nlp "rep1" -bash_nlp "rep3" +bash_nlp -# nlp_pash "$PASH_FLAGS" "par" "rep1" -nlp_pash "$PASH_FLAGS --parallel_pipelines_limit 8" "par" "rep3" +nlp_pash "$PASH_FLAGS" "par_no_du" -# nlp_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" -nlp_pash "$PASH_FLAGS --distributed_exec --parallel_pipelines_limit 24" "distr" "rep3" +nlp_pash "$PASH_FLAGS --parallel_pipelines --parallel_pipelines_limit 24" "par" + +nlp_pash "$PASH_FLAGS --distributed_exec" "distr_no_du" + +nlp_pash "$PASH_FLAGS --parallel_pipelines --distributed_exec --parallel_pipelines_limit 24" "distr" diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 84913531c..6eeaf36ac 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -27,8 +27,8 @@ oneliners_bash() { touch "$seq_times_file" cat $seq_times_file > $seq_times_file.d - echo executing one-liners $(date) | tee -a 
"$seq_times_file" - echo '' > "$seq_times_file" + echo executing one-liners $(date) | tee "$seq_times_file" + echo '' >> "$seq_times_file" for script_input in ${scripts_inputs[@]} do @@ -66,8 +66,8 @@ oneliners_pash(){ touch "$times_file" cat $times_file > $times_file.d - echo executing one-liners with $prefix pash with data $rep $(date) | tee -a "$times_file" - echo '' > "$times_file" + echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file" + echo '' >> "$times_file" for script_input in ${scripts_inputs[@]} do @@ -92,11 +92,11 @@ oneliners_pash(){ done } -oneliners_bash "rep1" +# oneliners_bash "rep1" oneliners_bash "rep3" -oneliners_pash "$PASH_FLAGS" "par" "rep1" +# oneliners_pash "$PASH_FLAGS" "par" "rep1" oneliners_pash "$PASH_FLAGS" "par" "rep3" -oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" +# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3" diff --git a/evaluation/distr_benchmarks/run_all.sh b/evaluation/distr_benchmarks/run_all.sh new file mode 100755 index 000000000..9162bd352 --- /dev/null +++ b/evaluation/distr_benchmarks/run_all.sh @@ -0,0 +1,5 @@ +cd $PASH_TOP/evaluation/distr_benchmarks/oneliners +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/nlp +bash run.distr.sh \ No newline at end of file From 747527d8aff54138e4106b02b0ea7b124d2df20d Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 09:55:34 -0400 Subject: [PATCH 15/37] added gitignores to outputs and inputs --- evaluation/distr_benchmarks/.gitignore | 1 + evaluation/distr_benchmarks/dependency_untangling/.gitignore | 3 +++ evaluation/distr_benchmarks/nlp/.gitignore | 1 + 3 files changed, 5 insertions(+) create mode 100644 evaluation/distr_benchmarks/.gitignore create mode 100644 evaluation/distr_benchmarks/dependency_untangling/.gitignore diff --git a/evaluation/distr_benchmarks/.gitignore b/evaluation/distr_benchmarks/.gitignore new file 
mode 100644 index 000000000..e6d35e74c --- /dev/null +++ b/evaluation/distr_benchmarks/.gitignore @@ -0,0 +1 @@ +outputs \ No newline at end of file diff --git a/evaluation/distr_benchmarks/dependency_untangling/.gitignore b/evaluation/distr_benchmarks/dependency_untangling/.gitignore new file mode 100644 index 000000000..46428b369 --- /dev/null +++ b/evaluation/distr_benchmarks/dependency_untangling/.gitignore @@ -0,0 +1,3 @@ +input/* +!input/install-deps.sh +!setup.sh \ No newline at end of file diff --git a/evaluation/distr_benchmarks/nlp/.gitignore b/evaluation/distr_benchmarks/nlp/.gitignore index 1dd206e6f..6e99c238a 100644 --- a/evaluation/distr_benchmarks/nlp/.gitignore +++ b/evaluation/distr_benchmarks/nlp/.gitignore @@ -1,2 +1,3 @@ exodus genesis +outputs \ No newline at end of file From 18f0a086b6d12a4488c3fab4b02c628e58e467a6 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 11:26:44 -0400 Subject: [PATCH 16/37] use temp ffiles n pure functions --- evaluation/distr_benchmarks/nlp/4_3.sh | 16 +++++++++++++--- evaluation/distr_benchmarks/nlp/4_3b.sh | 10 ++++++---- evaluation/distr_benchmarks/nlp/6_1.sh | 12 +++++++----- evaluation/distr_benchmarks/nlp/8.2_2.sh | 10 ++++++---- evaluation/distr_benchmarks/nlp/8.3_2.sh | 8 +++++--- evaluation/distr_benchmarks/nlp/8.3_3.sh | 12 +++++++----- evaluation/distr_benchmarks/nlp/8_1.sh | 8 +++++--- 7 files changed, 49 insertions(+), 27 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index 100c78918..e493172ae 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -7,12 +7,22 @@ IN=${IN:-/nlp/pg/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" +echo $ENTRIES + +pure_func() { + input=$1 + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.input.words + tail +2 ${TEMPDIR}/${input}.input.words > ${TEMPDIR}/${input}.input.nextwords + paste 
${TEMPDIR}/${input}.input.words ${TEMPDIR}/${input}.input.nextwords + + rm -rf ${TEMPDIR} +} +export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" > ${OUT}/${input}.input.words - tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords - paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$"| pure_func $input| sort | uniq -c > ${OUT}/${input}.input.bigrams done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/4_3b.sh b/evaluation/distr_benchmarks/nlp/4_3b.sh index a77f9dd26..ce9f5b7eb 100755 --- a/evaluation/distr_benchmarks/nlp/4_3b.sh +++ b/evaluation/distr_benchmarks/nlp/4_3b.sh @@ -9,11 +9,13 @@ mkdir -p "$OUT" pure_func() { input=$1 - cat > ${OUT}/${input}.words - tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords - tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 - paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.words + tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords + tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords2 + paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c + rm -rf ${TEMPDIR} } export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 39c328c20..b0836db6e 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -9,11 +9,13 @@ mkdir -p "$OUT" trigrams() { input=$1 - tr -sc '[A-Z][a-z]' '[\012*]' > ${OUT}/${input}.words - tail +2 ${OUT}/${input}.words > ${OUT}/${input}.nextwords 
- tail +3 ${OUT}/${input}.words > ${OUT}/${input}.nextwords2 - paste ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 | sort | uniq -c - rm -f ${OUT}/${input}.words ${OUT}/${input}.nextwords ${OUT}/${input}.nextwords2 + TEMPDIR=$(mktemp -d) + tr -sc '[A-Z][a-z]' '[\012*]' > ${TEMPDIR}/${input}.words + tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords + tail +3 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords2 + paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c + rm -f ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 + rm -rf ${TEMPDIR} } export -f trigrams diff --git a/evaluation/distr_benchmarks/nlp/8.2_2.sh b/evaluation/distr_benchmarks/nlp/8.2_2.sh index dc6ec685b..be50f92be 100755 --- a/evaluation/distr_benchmarks/nlp/8.2_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.2_2.sh @@ -10,10 +10,12 @@ mkdir -p "$OUT" pure_func() { input=$1 - cat > ${OUT}/${input}.input.words - tail +2 ${OUT}/${input}.input.words > ${OUT}/${input}.input.nextwords - paste ${OUT}/${input}.input.words ${OUT}/${input}.input.nextwords | sort | uniq -c > ${OUT}/${input}.input.bigrams - awk "\$1 == 2 {print \$2, \$3}" ${OUT}/${input}.input.bigrams + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.input.words + tail +2 ${TEMPDIR}/${input}.input.words > ${TEMPDIR}/${input}.input.nextwords + paste ${TEMPDIR}/${input}.input.words ${TEMPDIR}/${input}.input.nextwords | sort | uniq -c > ${TEMPDIR}/${input}.input.bigrams + awk "\$1 == 2 {print \$2, \$3}" ${TEMPDIR}/${input}.input.bigrams + rm -rf {TEMPDIR} } export -f pure_func diff --git a/evaluation/distr_benchmarks/nlp/8.3_2.sh b/evaluation/distr_benchmarks/nlp/8.3_2.sh index 47454d3b8..59265d767 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_2.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_2.sh @@ -9,9 +9,11 @@ mkdir -p "$OUT" pure_func() { input=$1 - sort -u > ${OUT}/${input}.types - rev < 
${OUT}/${input}.types > ${OUT}/${input}.types.rev - sort ${OUT}/${input}.types ${OUT}/${input}.types.rev | uniq -c | awk "\$1 >= 2 {print \$2}" + TEMPDIR=$(mktemp -d) + sort -u > ${TEMPDIR}/${input}.types + rev < ${TEMPDIR}/${input}.types > ${TEMPDIR}/${input}.types.rev + sort ${TEMPDIR}/${input}.types ${TEMPDIR}/${input}.types.rev | uniq -c | awk "\$1 >= 2 {print \$2}" + rm -rf ${TEMPDIR} } export -f pure_func diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 71b873a21..18af5aef6 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -5,17 +5,19 @@ IN=${IN:-/nlp/pg/} INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} -ENTRIES=${ENTRIES:-1060} +ENTRIES=${ENTRIES:-1} mkdir -p $OUT pure_func() { input=$1 - cat > ${OUT}/${input}1.types - hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${OUT}/${input}2.types - sort $OUT/${input}1.types ${OUT}/${input}2.types ${OUT}/${input}2.types | uniq -c | head - + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}1.types + hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${TEMPDIR}/${input}2.types + sort ${TEMPDIR}/${input}1.types ${TEMPDIR}/${input}2.types ${TEMPDIR}/${input}2.types | uniq -c | head + rm -rf ${TEMPDIR} } export -f pure_func + for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out diff --git a/evaluation/distr_benchmarks/nlp/8_1.sh b/evaluation/distr_benchmarks/nlp/8_1.sh index 7973476ba..07a27ed22 100755 --- a/evaluation/distr_benchmarks/nlp/8_1.sh +++ b/evaluation/distr_benchmarks/nlp/8_1.sh @@ -9,9 +9,11 @@ mkdir -p "$OUT" pure_func() { input=$1 - cat > ${OUT}/${input}.words - tr -sc '[AEIOUaeiou\012]' ' ' < ${OUT}/${input}.words | awk '{print NF}' > ${OUT}/${input}.syl - 
paste ${OUT}/${input}.syl ${OUT}/${input}.words | sort -nr | sed 5q + TEMPDIR=$(mktemp -d) + cat > ${TEMPDIR}/${input}.words + tr -sc '[AEIOUaeiou\012]' ' ' < ${TEMPDIR}/${input}.words | awk '{print NF}' > ${TEMPDIR}/${input}.syl + paste ${TEMPDIR}/${input}.syl ${TEMPDIR}/${input}.words | sort -nr | sed 5q + rm -rf ${TEMPDIR} } export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) From fd068837df31c5440f72f65eddcfd166e6a9fc8f Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 11:36:10 -0400 Subject: [PATCH 17/37] minor nlp fixes --- evaluation/distr_benchmarks/nlp/4_3.sh | 1 - evaluation/distr_benchmarks/nlp/6_1.sh | 1 - evaluation/distr_benchmarks/nlp/7_1.sh | 2 +- evaluation/distr_benchmarks/nlp/8.3_3.sh | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index e493172ae..e817e36b8 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -15,7 +15,6 @@ pure_func() { cat > ${TEMPDIR}/${input}.input.words tail +2 ${TEMPDIR}/${input}.input.words > ${TEMPDIR}/${input}.input.nextwords paste ${TEMPDIR}/${input}.input.words ${TEMPDIR}/${input}.input.nextwords - rm -rf ${TEMPDIR} } export -f pure_func diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index b0836db6e..5b4181fb2 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -14,7 +14,6 @@ trigrams() { tail +2 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords tail +3 ${TEMPDIR}/${input}.words > ${TEMPDIR}/${input}.nextwords2 paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c - rm -f ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 rm -rf ${TEMPDIR} } export -f trigrams diff --git 
a/evaluation/distr_benchmarks/nlp/7_1.sh b/evaluation/distr_benchmarks/nlp/7_1.sh index 7f3f81518..c78172cb2 100755 --- a/evaluation/distr_benchmarks/nlp/7_1.sh +++ b/evaluation/distr_benchmarks/nlp/7_1.sh @@ -13,4 +13,4 @@ do done echo 'done'; -rm ${OUT} +rm -rf ${OUT} diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 18af5aef6..937522b3f 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -5,7 +5,7 @@ IN=${IN:-/nlp/pg/} INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} -ENTRIES=${ENTRIES:-1} +ENTRIES=${ENTRIES:-1060} mkdir -p $OUT pure_func() { From 051df829eb4ef34e71943d31ab8f61d6da09b309 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 23:56:26 -0600 Subject: [PATCH 18/37] ported unix50 for distributed exec --- evaluation/distr_benchmarks/unix50/.gitignore | 3 + evaluation/distr_benchmarks/unix50/1.sh | 6 + evaluation/distr_benchmarks/unix50/10.sh | 6 + evaluation/distr_benchmarks/unix50/11.sh | 6 + evaluation/distr_benchmarks/unix50/12.sh | 6 + evaluation/distr_benchmarks/unix50/13.sh | 6 + evaluation/distr_benchmarks/unix50/14.sh | 6 + evaluation/distr_benchmarks/unix50/15.sh | 6 + evaluation/distr_benchmarks/unix50/16.sh | 6 + evaluation/distr_benchmarks/unix50/17.sh | 6 + evaluation/distr_benchmarks/unix50/18.sh | 6 + evaluation/distr_benchmarks/unix50/19.sh | 6 + evaluation/distr_benchmarks/unix50/2.sh | 6 + evaluation/distr_benchmarks/unix50/20.sh | 6 + evaluation/distr_benchmarks/unix50/21.sh | 6 + evaluation/distr_benchmarks/unix50/22.sh | 6 + evaluation/distr_benchmarks/unix50/23.sh | 6 + evaluation/distr_benchmarks/unix50/24.sh | 6 + evaluation/distr_benchmarks/unix50/25.sh | 6 + evaluation/distr_benchmarks/unix50/26.sh | 6 + evaluation/distr_benchmarks/unix50/27.sh | 6 + evaluation/distr_benchmarks/unix50/28.sh | 6 + evaluation/distr_benchmarks/unix50/29.sh | 6 + 
evaluation/distr_benchmarks/unix50/3.sh | 6 + evaluation/distr_benchmarks/unix50/30.sh | 6 + evaluation/distr_benchmarks/unix50/31.sh | 6 + evaluation/distr_benchmarks/unix50/32.sh | 6 + evaluation/distr_benchmarks/unix50/33.sh | 6 + evaluation/distr_benchmarks/unix50/34.sh | 6 + evaluation/distr_benchmarks/unix50/35.sh | 6 + evaluation/distr_benchmarks/unix50/36.sh | 5 + evaluation/distr_benchmarks/unix50/4.sh | 6 + evaluation/distr_benchmarks/unix50/5.sh | 6 + evaluation/distr_benchmarks/unix50/6.sh | 6 + evaluation/distr_benchmarks/unix50/7.sh | 6 + evaluation/distr_benchmarks/unix50/8.sh | 6 + evaluation/distr_benchmarks/unix50/9.sh | 6 + .../distr_benchmarks/unix50/input/setup.sh | 69 ++++++++ .../unix50/input/split-unix50.sh | 17 ++ .../distr_benchmarks/unix50/input/unix50.sh | 151 ++++++++++++++++++ .../distr_benchmarks/unix50/run.distr.sh | 78 +++++++++ 41 files changed, 533 insertions(+) create mode 100644 evaluation/distr_benchmarks/unix50/.gitignore create mode 100755 evaluation/distr_benchmarks/unix50/1.sh create mode 100755 evaluation/distr_benchmarks/unix50/10.sh create mode 100755 evaluation/distr_benchmarks/unix50/11.sh create mode 100755 evaluation/distr_benchmarks/unix50/12.sh create mode 100755 evaluation/distr_benchmarks/unix50/13.sh create mode 100755 evaluation/distr_benchmarks/unix50/14.sh create mode 100755 evaluation/distr_benchmarks/unix50/15.sh create mode 100755 evaluation/distr_benchmarks/unix50/16.sh create mode 100755 evaluation/distr_benchmarks/unix50/17.sh create mode 100755 evaluation/distr_benchmarks/unix50/18.sh create mode 100755 evaluation/distr_benchmarks/unix50/19.sh create mode 100755 evaluation/distr_benchmarks/unix50/2.sh create mode 100755 evaluation/distr_benchmarks/unix50/20.sh create mode 100755 evaluation/distr_benchmarks/unix50/21.sh create mode 100755 evaluation/distr_benchmarks/unix50/22.sh create mode 100755 evaluation/distr_benchmarks/unix50/23.sh create mode 100755 evaluation/distr_benchmarks/unix50/24.sh create 
mode 100755 evaluation/distr_benchmarks/unix50/25.sh create mode 100755 evaluation/distr_benchmarks/unix50/26.sh create mode 100755 evaluation/distr_benchmarks/unix50/27.sh create mode 100755 evaluation/distr_benchmarks/unix50/28.sh create mode 100755 evaluation/distr_benchmarks/unix50/29.sh create mode 100755 evaluation/distr_benchmarks/unix50/3.sh create mode 100755 evaluation/distr_benchmarks/unix50/30.sh create mode 100755 evaluation/distr_benchmarks/unix50/31.sh create mode 100755 evaluation/distr_benchmarks/unix50/32.sh create mode 100755 evaluation/distr_benchmarks/unix50/33.sh create mode 100755 evaluation/distr_benchmarks/unix50/34.sh create mode 100755 evaluation/distr_benchmarks/unix50/35.sh create mode 100755 evaluation/distr_benchmarks/unix50/36.sh create mode 100755 evaluation/distr_benchmarks/unix50/4.sh create mode 100755 evaluation/distr_benchmarks/unix50/5.sh create mode 100755 evaluation/distr_benchmarks/unix50/6.sh create mode 100755 evaluation/distr_benchmarks/unix50/7.sh create mode 100755 evaluation/distr_benchmarks/unix50/8.sh create mode 100755 evaluation/distr_benchmarks/unix50/9.sh create mode 100755 evaluation/distr_benchmarks/unix50/input/setup.sh create mode 100755 evaluation/distr_benchmarks/unix50/input/split-unix50.sh create mode 100755 evaluation/distr_benchmarks/unix50/input/unix50.sh create mode 100755 evaluation/distr_benchmarks/unix50/run.distr.sh diff --git a/evaluation/distr_benchmarks/unix50/.gitignore b/evaluation/distr_benchmarks/unix50/.gitignore new file mode 100644 index 000000000..30547eafd --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/.gitignore @@ -0,0 +1,3 @@ +inputs/* +intermediary/* +*.txt diff --git a/evaluation/distr_benchmarks/unix50/1.sh b/evaluation/distr_benchmarks/unix50/1.sh new file mode 100755 index 000000000..9684112ce --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/1.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.0: extract the last name +hdfs 
dfs -cat $IN1 | cut -d ' ' -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/10.sh b/evaluation/distr_benchmarks/unix50/10.sh new file mode 100755 index 000000000..3ef1eef49 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/10.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.4: histogram of Belle's captures (-pawns) by each type of piece +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep '[KQRBN]' | cut -c 1-1 | sort | uniq -c | sort -nr + diff --git a/evaluation/distr_benchmarks/unix50/11.sh b/evaluation/distr_benchmarks/unix50/11.sh new file mode 100755 index 000000000..0bee1dba1 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/11.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.5: 4.4 + pawns +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort | uniq -c | sort -nr + diff --git a/evaluation/distr_benchmarks/unix50/12.sh b/evaluation/distr_benchmarks/unix50/12.sh new file mode 100755 index 000000000..5bf77c806 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/12.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.6: piece used the most by Belle +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep '\.' | cut -d '.' 
-f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort -r | uniq | head -n 3 | tail -n 1 + diff --git a/evaluation/distr_benchmarks/unix50/13.sh b/evaluation/distr_benchmarks/unix50/13.sh new file mode 100755 index 000000000..9702a3861 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/13.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN5=$IN_PRE/5.txt +# 5.1: extract hello world +hdfs dfs -cat $IN5 | grep 'print' | cut -d "\"" -f 2 | cut -c 1-12 + diff --git a/evaluation/distr_benchmarks/unix50/14.sh b/evaluation/distr_benchmarks/unix50/14.sh new file mode 100755 index 000000000..f67e4d81a --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/14.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN6=$IN_PRE/6.txt +# 6.1: order the bodies by how easy it would be to land on them in Thompson's Space Travel game when playing at the highest simulation scale +hdfs dfs -cat $IN6 | awk "{print \$2, \$0}" | sort -nr | cut -d ' ' -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/15.sh b/evaluation/distr_benchmarks/unix50/15.sh new file mode 100755 index 000000000..abe5c620c --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/15.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN7=$IN_PRE/7.txt +# 7.1: identify number of AT&T unix versions +hdfs dfs -cat $IN7 | cut -f 1 | grep 'AT&T' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/16.sh b/evaluation/distr_benchmarks/unix50/16.sh new file mode 100755 index 000000000..7b95222b5 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/16.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN7=$IN_PRE/7.txt +# 7.2: find most frequently occurring machine +hdfs dfs -cat $IN7 | cut -f 2 | sort -n | uniq -c | sort -nr | head -n 1 | tr -s ' ' '\n' | tail -n 1 + diff --git a/evaluation/distr_benchmarks/unix50/17.sh b/evaluation/distr_benchmarks/unix50/17.sh new file mode 100755 index 000000000..3a0246204 --- /dev/null +++ 
b/evaluation/distr_benchmarks/unix50/17.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN7=$IN_PRE/7.txt +# 7.3: all the decades in which a unix version was released +hdfs dfs -cat $IN7 | cut -f 4 | sort -n | cut -c 3-3 | uniq | sed s/\$/'0s'/ + diff --git a/evaluation/distr_benchmarks/unix50/18.sh b/evaluation/distr_benchmarks/unix50/18.sh new file mode 100755 index 000000000..c90d3ff9b --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/18.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.1: count unix birth-year +hdfs dfs -cat $IN8 | tr ' ' '\n' | grep 1969 | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/19.sh b/evaluation/distr_benchmarks/unix50/19.sh new file mode 100755 index 000000000..5eaceae8d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/19.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.2: find Bell Labs location where Dennis Ritchie had his office +hdfs dfs -cat $IN8 | grep 'Bell' | awk 'length <= 45' | cut -d ',' -f 2 | awk "{\$1=\$1};1" + diff --git a/evaluation/distr_benchmarks/unix50/2.sh b/evaluation/distr_benchmarks/unix50/2.sh new file mode 100755 index 000000000..82eb4b460 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/2.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.1: extract names and sort +hdfs dfs -cat $IN1 | cut -d ' ' -f 2 | sort + diff --git a/evaluation/distr_benchmarks/unix50/20.sh b/evaluation/distr_benchmarks/unix50/20.sh new file mode 100755 index 000000000..3d121b839 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/20.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.3: find names of the four people most involved with unix +hdfs dfs -cat $IN8 | grep '(' | cut -d '(' -f 2 | cut -d ')' -f 1 | head -n 1 + diff --git a/evaluation/distr_benchmarks/unix50/21.sh b/evaluation/distr_benchmarks/unix50/21.sh new file mode 
100755 index 000000000..9578ad223 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/21.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# 8.4: find longest words without hyphens +hdfs dfs -cat $IN8 | tr -c "[a-z][A-Z]" '\n' | sort | awk "length >= 16" + diff --git a/evaluation/distr_benchmarks/unix50/22.sh b/evaluation/distr_benchmarks/unix50/22.sh new file mode 100755 index 000000000..44ccb21f6 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/22.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN8=$IN_PRE/8.txt +# # 8.5: Find second-most-freq 8-character word(s) without hyphens +# cat $IN8 > /dev/null + diff --git a/evaluation/distr_benchmarks/unix50/23.sh b/evaluation/distr_benchmarks/unix50/23.sh new file mode 100755 index 000000000..76cf0f938 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/23.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN91=$IN_PRE/9.1.txt +# 9.1: extract the word PORT +hdfs dfs -cat $IN91 | tr ' ' '\n' | grep '[A-Z]' | tr '[a-z]' '\n' | grep '[A-Z]' | tr -d '\n' | cut -c 1-4 + diff --git a/evaluation/distr_benchmarks/unix50/24.sh b/evaluation/distr_benchmarks/unix50/24.sh new file mode 100755 index 000000000..9ff9bf821 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/24.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN92=$IN_PRE/9.2.txt +# 9.2: extract the word BELL +hdfs dfs -cat $IN92 | cut -c 1-1 | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/25.sh b/evaluation/distr_benchmarks/unix50/25.sh new file mode 100755 index 000000000..b8f983ec0 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/25.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN93=$IN_PRE/9.3.txt +# 9.3: animal that used to decorate the Unix room +hdfs dfs -cat $IN93 | cut -c 1-2 | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/26.sh b/evaluation/distr_benchmarks/unix50/26.sh new file mode 100755 index 
000000000..aae9b34e4 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/26.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN94=$IN_PRE/9.4.txt +# 9.4: four corners with E centered, for an "X" configuration +hdfs dfs -cat $IN94 | tr ' ' '\n' | grep "\"" | sed 4d | cut -d "\"" -f 2 | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/27.sh b/evaluation/distr_benchmarks/unix50/27.sh new file mode 100755 index 000000000..41b1e5577 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/27.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN95=$IN_PRE/9.5.txt +# # 9.5: backwards running clock, in a backwards poem +# cat $IN95 > /dev/null + diff --git a/evaluation/distr_benchmarks/unix50/28.sh b/evaluation/distr_benchmarks/unix50/28.sh new file mode 100755 index 000000000..3f44d5cd3 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/28.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN96=$IN_PRE/9.6.txt +# 9.6: Follow the directions for grep +hdfs dfs -cat $IN96 | tr ' ' '\n' | grep '[A-Z]' | sed 1d | sed 3d | sed 3d | tr '[a-z]' '\n' | grep '[A-Z]' | sed 3d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/29.sh b/evaluation/distr_benchmarks/unix50/29.sh new file mode 100755 index 000000000..bb41389a0 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/29.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN97=$IN_PRE/9.7.txt +# 9.7: Four corners +hdfs dfs -cat $IN97 | sed 2d | sed 2d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/3.sh b/evaluation/distr_benchmarks/unix50/3.sh new file mode 100755 index 000000000..0519768a6 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/3.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.2: extract names and sort +hdfs dfs -cat $IN1 | head -n 2 | cut -d ' ' -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/30.sh 
b/evaluation/distr_benchmarks/unix50/30.sh new file mode 100755 index 000000000..bb13b5d36 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/30.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN98=$IN_PRE/9.8.txt +# 9.8: TELE-communications +hdfs dfs -cat $IN98 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 2d | sed 3d | sed 4d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/31.sh b/evaluation/distr_benchmarks/unix50/31.sh new file mode 100755 index 000000000..e9ba29c14 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/31.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN99=$IN_PRE/9.9.txt +# 9.9: +hdfs dfs -cat $IN99 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 1d | sed 2d | sed 3d | sed 5d | tr -c '[A-Z]' '\n' | tr -d '\n' + diff --git a/evaluation/distr_benchmarks/unix50/32.sh b/evaluation/distr_benchmarks/unix50/32.sh new file mode 100755 index 000000000..a3183b7bd --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/32.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN10=$IN_PRE/10.txt +# 10.1: count Turing award recipients while working at Bell Labs +hdfs dfs -cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/33.sh b/evaluation/distr_benchmarks/unix50/33.sh new file mode 100755 index 000000000..d9c1675b4 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/33.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN10=$IN_PRE/10.txt +# 10.2: list Turing award recipients while working at Bell Labs +hdfs dfs -cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 + diff --git a/evaluation/distr_benchmarks/unix50/34.sh b/evaluation/distr_benchmarks/unix50/34.sh new file mode 100755 index 000000000..d08551141 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/34.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN10=$IN_PRE/10.txt +# 10.3: extract Ritchie's username 
+hdfs dfs -cat $IN10 | grep 'Bell' | cut -f 2 | head -n 1 | fmt -w1 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + diff --git a/evaluation/distr_benchmarks/unix50/35.sh b/evaluation/distr_benchmarks/unix50/35.sh new file mode 100755 index 000000000..78436d485 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/35.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN11=$IN_PRE/11.txt +# 11.1: year Ritchie and Thompson receive the Hamming medal +hdfs dfs -cat $IN11 | grep 'UNIX' | cut -f 1 + diff --git a/evaluation/distr_benchmarks/unix50/36.sh b/evaluation/distr_benchmarks/unix50/36.sh new file mode 100755 index 000000000..376f8f23d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/36.sh @@ -0,0 +1,5 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN11=$IN_PRE/11.txt +# 11.2: most repeated first name in the list? +hdfs dfs -cat $IN11 | cut -f 2 | cut -d ' ' -f 1 | sort | uniq -c | sort -nr | head -n 1 | fmt -w1 | sed 1d diff --git a/evaluation/distr_benchmarks/unix50/4.sh b/evaluation/distr_benchmarks/unix50/4.sh new file mode 100755 index 000000000..e36bb9119 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/4.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN1=$IN_PRE/1.txt +# 1.3: sort top first names +hdfs dfs -cat $IN1 | cut -d ' ' -f 1 | sort | uniq -c | sort -r + diff --git a/evaluation/distr_benchmarks/unix50/5.sh b/evaluation/distr_benchmarks/unix50/5.sh new file mode 100755 index 000000000..32148f8da --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/5.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN2=$IN_PRE/2.txt +# 2.1: get all Unix utilities +hdfs dfs -cat $IN2 | cut -d ' ' -f 4 | tr -d ',' + diff --git a/evaluation/distr_benchmarks/unix50/6.sh b/evaluation/distr_benchmarks/unix50/6.sh new file mode 100755 index 000000000..79147caf6 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/6.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN3=$IN_PRE/3.txt +# 
3.1: get lowercase first letter of last names (awk) +hdfs dfs -cat $IN3 | cut -d ' ' -f 2 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + diff --git a/evaluation/distr_benchmarks/unix50/7.sh b/evaluation/distr_benchmarks/unix50/7.sh new file mode 100755 index 000000000..f9d4a9908 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/7.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.1: find number of rounds +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep '\.' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/8.sh b/evaluation/distr_benchmarks/unix50/8.sh new file mode 100755 index 000000000..a0bb5153d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/8.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.2: find pieces captured by Belle +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/9.sh b/evaluation/distr_benchmarks/unix50/9.sh new file mode 100755 index 000000000..d31ba769a --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/9.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export IN_PRE=${IN_PRE:-/unix50} +IN4=$IN_PRE/4.txt +# 4.3: find pieces captured by Belle with a pawn +hdfs dfs -cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep -v '[KQRBN]' | wc -l + diff --git a/evaluation/distr_benchmarks/unix50/input/setup.sh b/evaluation/distr_benchmarks/unix50/input/setup.sh new file mode 100755 index 000000000..01d7aaa21 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/input/setup.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +#set -e + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +## FIXME: These inputs are already 1G when downloaded +## FIXME: Also, wget is not silent like curl in the other setup scripts. 
+ +inputs=( +1 10 11 12 2 3 4 5 6 7 8 9.1 9.2 9.3 9.4 9.5 9.6 9.7 9.8 9.9 +) + +if [[ "$1" == "-c" ]]; then + for input in ${inputs[@]} + do + rm -f "${input}.txt" + done + exit +fi + +setup_dataset() { + # Put files in hdfs + hdfs dfs -mkdir /unix50 + + # generate small inputs + # if [ "$#" -eq 1 ] && [ "$1" = "--small" ]; then + # if [ ! -d ./small ]; then + # echo "Generating small-size inputs" + # # FIXME PR: Do we need all of them? + # curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/unix50.zip' > unix50.zip + # unzip unix50.zip + # rm -f unix50.zip + # fi + # hdfs dfs -put small /unix50/small + # return 0 + # fi + + for input in ${inputs[@]} + do + if [ ! -f "${input}.txt" ]; then + wget "http://ndr.md/data/unix50/${input}.txt" + "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" + fi + hdfs dfs -put $file /unix50/$file + done + + # increase the original input size 10x + if [ "$#" -eq 1 ] && [ "$1" = "--extended" ]; then + EXTENDED_INPUT_DIR="extended_input/" + mkdir -p $EXTENDED_INPUT_DIR + for file in *.txt; do + rm $EXTENDED_INPUT_DIR/$file + for (( i = 0; i < 10; i++ )); do + cat $file >> $EXTENDED_INPUT_DIR/temp.txt + done + done + hdfs dfs -put $EXTENDED_INPUT_DIR /unix50/$EXTENDED_INPUT_DIR + rm -rf $EXTENDED_INPUT_DIR + fi +} + +source_var() { + if [[ "$1" == "--extended" ]]; then + export IN_PRE=/unix50/extended_input + else + export IN_PRE=/unix50 + fi +} diff --git a/evaluation/distr_benchmarks/unix50/input/split-unix50.sh b/evaluation/distr_benchmarks/unix50/input/split-unix50.sh new file mode 100755 index 000000000..a0afe145d --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/input/split-unix50.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +awk -v RS= '{print > (NR ".txt")}' unix50.sh + +for file in *.txt; do + fname=$(basename -- "$file") + fscript="${fname%.*}".sh + echo $fscript + echo '#!/bin/bash' > $fscript + + echo 'export IN_PRE=${IN_PRE:-$PASH_TOP/evaluation/benchmarks/unix50/input}' >> $fscript + input=$(grep -o 'IN..' 
$file) + grep "^$(echo $input | xargs)=" unix50.sh >> $fscript + cat $file >> $fscript + echo '' >> $fscript +done + diff --git a/evaluation/distr_benchmarks/unix50/input/unix50.sh b/evaluation/distr_benchmarks/unix50/input/unix50.sh new file mode 100755 index 000000000..7c5182bc8 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/input/unix50.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# scripts from https://unixgame.io/ +# https://github.com/psinghbh/softsec.github.io +# input files https://github.com/psinghbh/softsec.github.io/tree/master/ctf/unixgame.io/challenges +# Which join is easier: http://www.theunixschool.com/2011/08/5-different-ways-to-join-all-lines-in.html +# 1 (default) + 3 + 1 + 1 + 6 + 1 + 1 + 3 + 5 + 9 + 3 + 2 + 1 = 37 (there are 3 missing) +# missing 8.5, 9.5, 12.1 + +if [[ -z "$IN_PRE" ]]; then + if [[ -z "$PASH_TOP" ]]; then + echo "Need to provide PASH_TOP, possibly $(git rev-parse --show-toplevel)" 1>&2 + exit 1 + else + export IN_PRE=$PASH_TOP/evaluation/benchmarks/unix50/input + fi +fi + +IN1=$IN_PRE/1.txt +IN2=$IN_PRE/2.txt +IN3=$IN_PRE/3.txt +IN4=$IN_PRE/4.txt +IN5=$IN_PRE/5.txt +IN6=$IN_PRE/6.txt +IN7=$IN_PRE/7.txt +IN8=$IN_PRE/8.txt +IN91=$IN_PRE/9.1.txt +IN92=$IN_PRE/9.2.txt +IN93=$IN_PRE/9.3.txt +IN94=$IN_PRE/9.4.txt +IN95=$IN_PRE/9.5.txt +IN96=$IN_PRE/9.6.txt +IN97=$IN_PRE/9.7.txt +IN98=$IN_PRE/9.8.txt +IN99=$IN_PRE/9.9.txt +IN10=$IN_PRE/10.txt +IN11=$IN_PRE/11.txt +IN12=$IN_PRE/12.txt + +# 1.0: extract the last name +cat $IN1 | cut -d ' ' -f 2 + +# 1.1: extract names and sort +cat $IN1 | cut -d ' ' -f 2 | sort + +# 1.2: extract names and sort +cat $IN1 | head -n 2 | cut -d ' ' -f 2 + +# 1.3: sort top first names +cat $IN1 | cut -d ' ' -f 1 | sort | uniq -c | sort -r + +# 2.1: get all Unix utilities +cat $IN2 | cut -d ' ' -f 4 | tr -d ',' + +# 3.1: get lowercase first letter of last names (awk) +cat $IN3 | cut -d ' ' -f 2 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + +# 4.1: find number of rounds +cat $IN4 | tr ' ' '\n' | grep '\.' 
| wc -l + +# 4.2: find pieces captured by Belle +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | wc -l + +# 4.3: find pieces captured by Belle with a pawn +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep -v '[KQRBN]' | wc -l + +# 4.4: histogram of Belle's captures (-pawns) by each type of piece +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | grep '[KQRBN]' | cut -c 1-1 | sort | uniq -c | sort -nr + +# 4.5: 4.4 + pawns +cat $IN4 | tr ' ' '\n' | grep 'x' | grep '\.' | cut -d '.' -f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort | uniq -c | sort -nr + +# 4.6: piece used the most by Belle +cat $IN4 | tr ' ' '\n' | grep '\.' | cut -d '.' -f 2 | cut -c 1-1 | tr '[a-z]' 'P' | sort -r | uniq | head -n 3 | tail -n 1 + +# 5.1: extract hello world +cat $IN5 | grep 'print' | cut -d "\"" -f 2 | cut -c 1-12 + +# 6.1: order the bodies by how easy it would be to land on them in Thompson's Space Travel game when playing at the highest simulation scale +cat $IN6 | awk "{print \$2, \$0}" | sort -nr | cut -d ' ' -f 2 + +# 7.1: identify number of AT&T unix versions +cat $IN7 | cut -f 1 | grep 'AT&T' | wc -l + +# 7.2: find most frequently occurring machine +cat $IN7 | cut -f 2 | sort -n | uniq -c | sort -nr | head -n 1 | tr -s ' ' '\n' | tail -n 1 + +# 7.3: all the decades in which a unix version was released +cat $IN7 | cut -f 4 | sort -n | cut -c 3-3 | uniq | sed s/\$/'0s'/ + +# 8.1: count unix birth-year +cat $IN8 | tr ' ' '\n' | grep 1969 | wc -l + +# 8.2: find Bell Labs location where Dennis Ritchie had his office +cat $IN8 | grep 'Bell' | awk 'length <= 45' | cut -d ',' -f 2 | awk "{\$1=\$1};1" + +# 8.3: find names of the four people most involved with unix +cat $IN8 | grep '(' | cut -d '(' -f 2 | cut -d ')' -f 1 | head -n 1 + +# 8.4: find longest words without hyphens +cat $IN8 | tr -c "[a-z][A-Z]" '\n' | sort | awk "length >= 16" + +# # 8.5: Find second-most-freq 8-character word(s) without hyphens +# cat $IN8 > /dev/null + +# 9.1: extract the 
word PORT +cat $IN91 | tr ' ' '\n' | grep '[A-Z]' | tr '[a-z]' '\n' | grep '[A-Z]' | tr -d '\n' | cut -c 1-4 + +# 9.2: extract the word BELL +cat $IN92 | cut -c 1-1 | tr -d '\n' + +# 9.3: animal that used to decorate the Unix room +cat $IN93 | cut -c 1-2 | tr -d '\n' + +# 9.4: four corners with E centered, for an "X" configuration +cat $IN94 | tr ' ' '\n' | grep "\"" | sed 4d | cut -d "\"" -f 2 | tr -d '\n' + +# # 9.5: backwards running clock, in a backwards poem +# cat $IN95 > /dev/null + +# 9.6: Follow the directions for grep +cat $IN96 | tr ' ' '\n' | grep '[A-Z]' | sed 1d | sed 3d | sed 3d | tr '[a-z]' '\n' | grep '[A-Z]' | sed 3d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 9.7: Four corners +cat $IN97 | sed 2d | sed 2d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 9.8: TELE-communications +cat $IN98 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 2d | sed 3d | sed 4d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 9.9: +cat $IN99 | tr -c '[a-z][A-Z]' '\n' | grep '[A-Z]' | sed 1d | sed 1d | sed 2d | sed 3d | sed 5d | tr -c '[A-Z]' '\n' | tr -d '\n' + +# 10.1: count Turing award recipients while working at Bell Labs +cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 | wc -l + +# 10.2: list Turing award recipients while working at Bell Labs +cat $IN10 | sed 1d | grep 'Bell' | cut -f 2 + +# 10.3: extract Ritchie's username +cat $IN10 | grep 'Bell' | cut -f 2 | head -n 1 | fmt -w1 | cut -c 1-1 | tr -d '\n' | tr '[A-Z]' '[a-z]' + +# 11.1: year Ritchie and Thompson receive the Hamming medal +cat $IN11 | grep 'UNIX' | cut -f 1 + +# 11.2: most repeated first name in the list? +cat $IN11 | cut -f 2 | cut -d ' ' -f 1 | sort | uniq -c | sort -nr | head -n 1 | fmt -w1 | sed 1d + + +# # 12.1: transform this list of instructions such that if the snake follows the +# # new instructions top to bottom, it ends on the location of the apple. 
+# cat $IN12 > /dev/null diff --git a/evaluation/distr_benchmarks/unix50/run.distr.sh b/evaluation/distr_benchmarks/unix50/run.distr.sh new file mode 100755 index 000000000..2526bbbe4 --- /dev/null +++ b/evaluation/distr_benchmarks/unix50/run.distr.sh @@ -0,0 +1,78 @@ +PASH_FLAGS='--width 8 --r_split' +export TIMEFORMAT=%R + +if [[ "$1" == "--extended" ]]; then + echo "Using extended input" + export IN_PRE=/unix50/extended_input + else + export IN_PRE=/unix50 +fi + +unix50_bash(){ + times_file="seq.res" + outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix50 $(date) | tee "$times_file" + echo '' >> "$times_file" + + for number in `seq 36` + do + script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done +} + + +unix50_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing Unix50 $(date) | tee "$times_file" + echo '' >> "$times_file" + + for number in `seq 36` + do + script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a 
"$times_file" + done +} + +unix50_bash + +unix50_pash "$PASH_FLAGS" "par" + +unix50_pash "$PASH_FLAGS --distributed_exec" "distr" From be3fae29b6dd5e460a0979b6f70951a764420219 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 2 Jun 2022 23:58:11 -0600 Subject: [PATCH 19/37] ported analytics mts to distributed exec --- .../distr_benchmarks/analytics-mts/1.sh | 21 ++++++ .../distr_benchmarks/analytics-mts/2.sh | 22 ++++++ .../distr_benchmarks/analytics-mts/3.sh | 22 ++++++ .../distr_benchmarks/analytics-mts/4.sh | 21 ++++++ .../distr_benchmarks/analytics-mts/5.sh | 18 +++++ .../distr_benchmarks/analytics-mts/README.md | 10 +++ .../analytics-mts/input/.gitignore | 5 ++ .../analytics-mts/input/setup.sh | 43 +++++++++++ .../analytics-mts/run-experiment.sh | 36 +++++++++ .../analytics-mts/run.distr.sh | 75 +++++++++++++++++++ 10 files changed, 273 insertions(+) create mode 100755 evaluation/distr_benchmarks/analytics-mts/1.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/2.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/3.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/4.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/5.sh create mode 100644 evaluation/distr_benchmarks/analytics-mts/README.md create mode 100644 evaluation/distr_benchmarks/analytics-mts/input/.gitignore create mode 100755 evaluation/distr_benchmarks/analytics-mts/input/setup.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/run-experiment.sh create mode 100755 evaluation/distr_benchmarks/analytics-mts/run.distr.sh diff --git a/evaluation/distr_benchmarks/analytics-mts/1.sh b/evaluation/distr_benchmarks/analytics-mts/1.sh new file mode 100755 index 000000000..746087898 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/1.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Vehicles on the road per day + +# out1 + +# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | +# bzip2 -d | # decompress +# 
Replace the line below with the two lines above to stream the latest file +cat $IN | # assumes saved input + sed 's/T..:..:..//' | # hide times + cut -d ',' -f 1,3 | # keep only day and bus no + sort -u | # remove duplicate records due to time + cut -d ',' -f 1 | # keep all dates + sort | # preparing for uniq + uniq -c | # count unique dates + awk -v OFS="\t" "{print \$2,\$1}" # print first date, then count + +# diff out{1,} diff --git a/evaluation/distr_benchmarks/analytics-mts/2.sh b/evaluation/distr_benchmarks/analytics-mts/2.sh new file mode 100755 index 000000000..9de4272f8 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/2.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Days a vehicle is on the road + +# out1 + +# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | +# bzip2 -d | # decompress +# Replace the line below with the two lines above to stream the latest file +cat $IN | # assumes saved input + sed 's/T..:..:..//' | # hide times + cut -d ',' -f 3,1 | # keep only day and bus ID + sort -u | # removing duplicate day-buses + cut -d ',' -f 2 | # keep only bus ID + sort | # preparing for uniq + uniq -c | # count unique dates + sort -k1n | # sort in reverse numerical order + awk -v OFS="\t" "{print \$2,\$1}" # print first date, then count + +# diff out{1,} diff --git a/evaluation/distr_benchmarks/analytics-mts/3.sh b/evaluation/distr_benchmarks/analytics-mts/3.sh new file mode 100755 index 000000000..d1bd67024 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/3.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Hours each vehicle is on the road + +# out1 + +# curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | +# bzip2 -d | # decompress +# Replace the line below with the two lines above to stream the latest file +cat $IN | # assumes saved input + sed 's/T\(..\):..:../,\1/' | # keep times only + cut -d ',' -f 1,2,4 | # keep only time date and bus id + sort -u | # removing duplicate entries + cut -d 
',' -f 3 | # keep only bus ID + sort | # preparing for uniq + uniq -c | # count hours per bus + sort -k1n | # sort in numerical order + awk -v OFS="\t" "{print \$2,\$1}" # print first date, then count + +# diff out{1,} diff --git a/evaluation/distr_benchmarks/analytics-mts/4.sh b/evaluation/distr_benchmarks/analytics-mts/4.sh new file mode 100755 index 000000000..e77f8efdf --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/4.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Hours monitored each day + +# out diff --git a/evaluation/distr_benchmarks/analytics-mts/README.md b/evaluation/distr_benchmarks/analytics-mts/README.md new file mode 100644 index 000000000..f6edda06c --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/README.md @@ -0,0 +1,10 @@ +# Mass-Transport System Analytics + +This set of scripts is part of [a recent study on OASA](https://insidestory.gr/article/noymera-leoforeia-athinas) from Diomidis Spinellis and Eleftheria Tsaliki. OASA is the mass-transport system supporting the city of Athens. + +1. `1.sh`: Vehicles on the road per day +2. `2.sh`: Days a vehicle is on the road +3. `3.sh`: Hours each vehicle is on the road +4. `4.sh`: Hours monitored each day +5. 
`5.sh`: Hours each bus is active each day + diff --git a/evaluation/distr_benchmarks/analytics-mts/input/.gitignore b/evaluation/distr_benchmarks/analytics-mts/input/.gitignore new file mode 100644 index 000000000..f264282d2 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/input/.gitignore @@ -0,0 +1,5 @@ +./oasa-2021-01-08.bz2 +in*.csv +./out +./out1 +*.out diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh new file mode 100755 index 000000000..f010ef168 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# #Check that we are in the appropriate directory where setup.sh is +# #https://stackoverflow.com/a/246128 +# DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +# echo "changing to $DIR to run setup.sh" +# cd $DIR + +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} + +if [[ "$1" == "-c" ]]; then + rm -f *.bz2 'in.csv' 'in_small.csv' + exit +fi + +setup_dataset() { + hdfs dfs -mkdir /analytics-mts + if [ ! -f ./in.csv ] && [ "$1" = "--full" ]; then + # yesterday=$(date --date='1 days ago' +'%y-%m-%d') + # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | + curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv + if [ $? -ne 0 ]; then + echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors" + exit 1 + fi + hdfs dfs -put in.csv /analytics-mts/in.csv + elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then + if [ ! -f ./in_small.csv ]; then + echo "Generating small-size inputs" + # FIXME PR: Do we need all of them? 
+ curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv + fi + hdfs dfs -put in_small.csv /analytics-mts/in_small.csv + fi +} + +source_var() { + if [[ "$1" == "--small" ]]; then + export IN="input/in_small.csv" + else + export IN="input/in.csv" + fi +} diff --git a/evaluation/distr_benchmarks/analytics-mts/run-experiment.sh b/evaluation/distr_benchmarks/analytics-mts/run-experiment.sh new file mode 100755 index 000000000..ca0a0c010 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/run-experiment.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +export PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel --show-superproject-working-tree)} + +eval_dir="$PASH_TOP/evaluation/buses/" +results_dir="${eval_dir}/results/" + +mkdir -p $results_dir + +for i in 1 2 3 4 +do + script="${eval_dir}/${i}.sh" + echo "Executing $script..." + + seq_output=/tmp/seq_output + pash_width_16_no_cat_split_output=/tmp/pash_16_no_cat_split_output + pash_width_16_output=/tmp/pash_16_output + + seq_time="${results_dir}/${i}_2_seq.time" + pash_width_16_no_cat_split_time="${results_dir}/${i}_16_distr_auto_split_fan_in_fan_out.time" + pash_width_16_time="${results_dir}/${i}_16_distr_auto_split.time" + + echo "Executing the script with bash..." + { time /bin/bash $script > $seq_output ; } 2> >(tee "${seq_time}" >&2) + + echo "Executing the script with pash -w 16 without the cat-split optimization (log in: /tmp/pash_16_log)" + { time $PASH_TOP/pa.sh -w 16 -d 1 --log_file /tmp/pash_16_no_cat_split_log --no_cat_split_vanish --output_time $script ; } 1> "$pash_width_16_no_cat_split_output" 2> >(tee "${pash_width_16_no_cat_split_time}" >&2) + echo "Checking for output equivalence..." 
+ diff -s $seq_output $pash_width_16_no_cat_split_output | head + + echo "Executing the script with pash -w 16 (log in: /tmp/pash_16_log)" + { time $PASH_TOP/pa.sh -w 16 -d 1 --log_file /tmp/pash_16_log --output_time $script ; } 1> "$pash_width_16_output" 2> >(tee "${pash_width_16_time}" >&2) + echo "Checking for output equivalence..." + diff -s $seq_output $pash_width_16_output | head + +done diff --git a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh new file mode 100755 index 000000000..4823a1f61 --- /dev/null +++ b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh @@ -0,0 +1,75 @@ +PASH_FLAGS='--width 8 --r_split' +export TIMEFORMAT=%R + +if [[ "$1" == "--small" ]]; then + export IN="input/in_small.csv" +else + export IN="input/in.csv" +fi + +analytics-mts_bash(){ + times_file="seq.res" + outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing MTS analytics $(date) | tee "$times_file" + echo '' >> "$times_file" + ## FIXME 5.sh is not working yet + for number in `seq 4` + do + script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + # select the respective input + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done +} + +analytics-mts_pash(){ + flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing MTS analytics with pash $(date) | tee "$times_file" + echo '' >> "$times_file" + ## FIXME 5.sh is not working yet + for number in `seq 4` + do + 
script="${number}" + + printf -v pad %20s + padded_script="${script}.sh:${pad}" + padded_script=${padded_script:0:20} + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + pash_log="${pash_logs_dir}/${script}.pash.log" + single_time_file="${outputs_dir}/${script}.${time_suffix}" + + echo -n "${padded_script}" | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" + done +} + +analytics-mts_bash + +analytics-mts_pash "$PASH_FLAGS" "par" + +analytics-mts_pash "$PASH_FLAGS --distributed_exec" "distr" From 170e7cd1ea54f25628d33014a1f8ef5cb9573250 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Fri, 3 Jun 2022 00:02:18 -0600 Subject: [PATCH 20/37] added gitingore --- evaluation/distr_benchmarks/.gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/.gitignore b/evaluation/distr_benchmarks/.gitignore index e6d35e74c..e9dd79c07 100644 --- a/evaluation/distr_benchmarks/.gitignore +++ b/evaluation/distr_benchmarks/.gitignore @@ -1 +1,3 @@ -outputs \ No newline at end of file +outputs +*.res.* +*.txt \ No newline at end of file From 017ef5598fcd2768c9acb5c214ba6d5e6cd26d77 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Fri, 3 Jun 2022 02:38:58 -0400 Subject: [PATCH 21/37] fix trigrams nlp --- evaluation/distr_benchmarks/nlp/6_1.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 5b4181fb2..1adaca799 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -7,7 +7,7 @@ OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/6_1/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" -trigrams() { +pure_func() { input=$1 TEMPDIR=$(mktemp -d) tr -sc '[A-Z][a-z]' '[\012*]' > ${TEMPDIR}/${input}.words @@ -16,12 +16,12 @@ trigrams() { 
paste ${TEMPDIR}/${input}.words ${TEMPDIR}/${input}.nextwords ${TEMPDIR}/${input}.nextwords2 | sort | uniq -c rm -rf ${TEMPDIR} } -export -f trigrams +export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN"/"$input | grep 'the land of' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 - hdfs dfs -cat $IN"/"$input | grep 'And he said' | trigrams ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 + hdfs dfs -cat $IN"/"$input | grep 'the land of' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 + hdfs dfs -cat $IN"/"$input | grep 'And he said' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 done echo 'done'; From 039334ba700c8080414fd7f9edf4c7f948b965ac Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 6 Jun 2022 16:08:06 -0400 Subject: [PATCH 22/37] some fixes --- evaluation/distr_benchmarks/analytics-mts/1.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/2.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/3.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/4.sh | 2 +- evaluation/distr_benchmarks/analytics-mts/input/setup.sh | 6 +++--- evaluation/distr_benchmarks/analytics-mts/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/nlp/6_1.sh | 4 ++-- evaluation/distr_benchmarks/nlp/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/unix50/input/setup.sh | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/evaluation/distr_benchmarks/analytics-mts/1.sh b/evaluation/distr_benchmarks/analytics-mts/1.sh index 746087898..1ce28dc4a 100755 --- a/evaluation/distr_benchmarks/analytics-mts/1.sh +++ b/evaluation/distr_benchmarks/analytics-mts/1.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 
's/T..:..:..//' | # hide times cut -d ',' -f 1,3 | # keep only day and bus no sort -u | # remove duplicate records due to time diff --git a/evaluation/distr_benchmarks/analytics-mts/2.sh b/evaluation/distr_benchmarks/analytics-mts/2.sh index 9de4272f8..59abd8de4 100755 --- a/evaluation/distr_benchmarks/analytics-mts/2.sh +++ b/evaluation/distr_benchmarks/analytics-mts/2.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 's/T..:..:..//' | # hide times cut -d ',' -f 3,1 | # keep only day and bus ID sort -u | # removing duplicate day-buses diff --git a/evaluation/distr_benchmarks/analytics-mts/3.sh b/evaluation/distr_benchmarks/analytics-mts/3.sh index d1bd67024..829442fc9 100755 --- a/evaluation/distr_benchmarks/analytics-mts/3.sh +++ b/evaluation/distr_benchmarks/analytics-mts/3.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 's/T\(..\):..:../,\1/' | # keep times only cut -d ',' -f 1,2,4 | # keep only time date and bus id sort -u | # removing duplicate entries diff --git a/evaluation/distr_benchmarks/analytics-mts/4.sh b/evaluation/distr_benchmarks/analytics-mts/4.sh index e77f8efdf..36c4010bc 100755 --- a/evaluation/distr_benchmarks/analytics-mts/4.sh +++ b/evaluation/distr_benchmarks/analytics-mts/4.sh @@ -9,7 +9,7 @@ # curl https://balab.aueb.gr/~dds/oasa-$(date --date='1 days ago' +'%y-%m-%d').bz2 | # bzip2 -d | # decompress # Replace the line below with the two lines above to stream the latest file -cat $IN | # assumes saved input +hdfs dfs -cat $IN | # assumes saved input sed 
's/T\(..\):..:../,\1/' | # keep times only cut -d ',' -f 1,2 | # keep only time and date sort -u | # removing duplicate entries diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh index f010ef168..7dc7d067f 100755 --- a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh +++ b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh @@ -15,7 +15,7 @@ fi setup_dataset() { hdfs dfs -mkdir /analytics-mts - if [ ! -f ./in.csv ] && [ "$1" = "--full" ]; then + if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then # yesterday=$(date --date='1 days ago' +'%y-%m-%d') # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv @@ -36,8 +36,8 @@ setup_dataset() { source_var() { if [[ "$1" == "--small" ]]; then - export IN="input/in_small.csv" + export IN="analytics-mts/in_small.csv" else - export IN="input/in.csv" + export IN="analytics-mts/in.csv" fi } diff --git a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh index 4823a1f61..9426fcbd9 100755 --- a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh +++ b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh @@ -2,9 +2,9 @@ PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R if [[ "$1" == "--small" ]]; then - export IN="input/in_small.csv" + export IN="/analytics-mts/in_small.csv" else - export IN="input/in.csv" + export IN="/analytics-mts/in.csv" fi analytics-mts_bash(){ diff --git a/evaluation/distr_benchmarks/nlp/6_1.sh b/evaluation/distr_benchmarks/nlp/6_1.sh index 1adaca799..d0cea8ad9 100755 --- a/evaluation/distr_benchmarks/nlp/6_1.sh +++ b/evaluation/distr_benchmarks/nlp/6_1.sh @@ -20,8 +20,8 @@ export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN"/"$input | grep 'the land of' | pure_func ${input} | sort 
-nr | sed 5q > ${OUT}/${input}.out0 - hdfs dfs -cat $IN"/"$input | grep 'And he said' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 + hdfs dfs -cat $IN/$input | grep 'the land of' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out0 + hdfs dfs -cat $IN/$input | grep 'And he said' | pure_func ${input} | sort -nr | sed 5q > ${OUT}/${input}.out1 done echo 'done'; diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index b693d5065..c1285f73a 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -14,10 +14,10 @@ names_scripts=( "2syllable_words;6_5" "4letter_words;6_2" "bigrams_appear_twice;8.2_2" - # "bigrams;4_3" + "bigrams;4_3" "compare_exodus_genesis;8.3_3" "count_consonant_seq;7_2" - # "count_morphs;7_1" + "count_morphs;7_1" "count_trigrams;4_3b" "count_vowel_seq;2_2" "count_words;1_1" diff --git a/evaluation/distr_benchmarks/unix50/input/setup.sh b/evaluation/distr_benchmarks/unix50/input/setup.sh index 01d7aaa21..68b831d82 100755 --- a/evaluation/distr_benchmarks/unix50/input/setup.sh +++ b/evaluation/distr_benchmarks/unix50/input/setup.sh @@ -42,7 +42,7 @@ setup_dataset() { wget "http://ndr.md/data/unix50/${input}.txt" "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" fi - hdfs dfs -put $file /unix50/$file + hdfs dfs -put "${input}.txt" /unix50/"${input}.txt" done # increase the original input size 10x From 99c3dbb85a811f2253ae5be3891770c5c30cd43e Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 6 Jun 2022 17:24:33 -0400 Subject: [PATCH 23/37] port some dependency untagling scripts to hdfs --- .../dependency_untangling/compress_files.sh | 20 +- .../dependency_untangling/encrypt_files.sh | 20 +- .../dependency_untangling/img_convert.sh | 12 +- .../input/install-deps.sh | 45 +-- .../dependency_untangling/input/setup.sh | 261 +++++++++--------- .../dependency_untangling/nginx.sh | 34 +-- 
.../dependency_untangling/pcap.sh | 29 +- .../dependency_untangling/run.distr.sh | 84 +++--- .../dependency_untangling/to_mp3.sh | 10 +- 9 files changed, 263 insertions(+), 252 deletions(-) diff --git a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh index 652ce1969..d7c331e84 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh @@ -1,21 +1,19 @@ #!/bin/bash # compress all files in a directory -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data/} +IN=${IN:-/dependency_untangling/pcap_data/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/compress} -LOGS=${OUT}/logs -mkdir -p ${OUT}/logs -run_tests() { - name=$(basename $1).zip - zip -r ${OUT}/$name $1 + +mkdir -p ${OUT} +pure_func() { + zip -r -- } -export -f run_tests +export -f pure_func -pkg_count=0 -for item in ${IN}/*; +for item in $(hdfs dfs -ls -C ${IN}); do - pkg_count=$((pkg_count + 1)); - run_tests $item > "${LOGS}"/"$pkg_count.log" + output_name=$(basename $item).zip + hdfs dfs -cat $item | pure_func > $OUT/$output_name done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh index 421732513..dfec87ea9 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/encrypt_files.sh @@ -1,20 +1,18 @@ #!/bin/bash # encrypt all files in a directory -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +IN=${IN:-/dependency_untangling/pcap_data} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/encrypt} -LOGS=${OUT}/logs -mkdir -p ${LOGS} -run_tests() { - openssl enc -aes-256-cbc -pbkdf2 -iter 20000 -in $1 
-out $OUT/$(basename $1).enc -k 'key' -} +mkdir -p ${OUT} -export -f run_tests -pkg_count=0 +pure_func() { + openssl enc -aes-256-cbc -pbkdf2 -iter 20000 -k 'key' +} +export -f pure_func -for item in ${IN}/*; +for item in $(hdfs dfs -ls -C ${IN}); do - pkg_count=$((pkg_count + 1)); - run_tests $item > ${LOGS}/${pkg_count}.log + output_name=$(basename $item).enc + hdfs dfs -cat $item | pure_func > $OUT/$output_name done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh index 2b87d0528..d38e474f2 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/img_convert.sh @@ -1,12 +1,18 @@ #!/bin/bash # tag: resize image -IN=${JPG:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/jpg} +IN=${JPG:-/dependency_untangling/jpg} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/jpg} mkdir -p ${OUT} -for i in $IN/*.jpg; + +pure_func () { + convert -resize 70% "-" "-" +} +export -f pure_func + +for i in $(hdfs dfs -ls -C ${IN}/*.jpg); do out=$OUT/$(basename -- $i) - convert -resize 70% "$i" "$out"; + hdfs dfs -cat $i | pure_func > $out; done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh index 4cb9e845a..3d4a75b1a 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh @@ -8,27 +8,28 @@ if ! dpkg -s $pkgs >/dev/null 2>&1 ; then echo 'Packages Installed' fi -if [ ! 
-d ${IN}/deps/samtools-1.7 ]; then - cd ${IN}/deps/ - wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip - unzip 1.7.zip - rm 1.7.zip - cd samtools-1.7 - wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip - unzip 1.7.zip - autoheader # Build config.h.in (this may generate a warning about - # AC_CONFIG_SUBDIRS - please ignore it). - autoconf -Wno-syntax # Generate the configure script - ./configure # Needed for choosing optional functionality - make - rm -rf 1.7.zip - echo 'Samtools installed' -fi +# NOT used +# if [ ! -d ${IN}/deps/samtools-1.7 ]; then +# cd ${IN}/deps/ +# wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip +# unzip 1.7.zip +# rm 1.7.zip +# cd samtools-1.7 +# wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip +# unzip 1.7.zip +# autoheader # Build config.h.in (this may generate a warning about +# # AC_CONFIG_SUBDIRS - please ignore it). +# autoconf -Wno-syntax # Generate the configure script +# ./configure # Needed for choosing optional functionality +# make +# rm -rf 1.7.zip +# echo 'Samtools installed' +# fi -if [ ! -f ${IN}/deps/makedeb.deb ]; then - cd ${IN}/deps/ - wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb - sudo dpkg -i makedeb.deb - echo 'Makedeb installed' -fi +# if [ ! 
-f ${IN}/deps/makedeb.deb ]; then +# cd ${IN}/deps/ +# wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb +# sudo dpkg -i makedeb.deb +# echo 'Makedeb installed' +# fi diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh index 58ee4bd7d..d3baf70ca 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh @@ -25,136 +25,141 @@ if [ "$1" == "-c" ]; then fi setup_dataset() { - if [ "$1" == "--small" ]; then - LOG_DATA_FILES=6 - WAV_DATA_FILES=20 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip - JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip - PCAP_DATA_FILES=1 - else - LOG_DATA_FILES=84 - WAV_DATA_FILES=120 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip - JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip - PCAP_DATA_FILES=15 - fi - - if [ ! -d ${IN}/wav ]; then - wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip - unzip wav.zip && cd wav/ - for f in *.wav; do - FILE=$(basename "$f") - for (( i = 0; i <= $WAV_DATA_FILES; i++)) do - echo copying to $f$i.wav - cp $f $f$i.wav - done - done - echo "WAV Generated" - fi - - if [ ! -d ${IN}/jpg ]; then - cd ${IN} - wget $JPG_DATA_LINK - unzip jpg.zip - echo "JPG Generated" - rm -rf ${IN}/jpg.zip - fi - - # download the input for the nginx logs and populate the dataset - if [ ! 
-d ${IN}/log_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip - unzip nginx.zip - rm nginx.zip - # generating analysis logs - mkdir -p ${IN}/log_data - for (( i = 1; i <=$LOG_DATA_FILES; i++)) do - for j in nginx-logs/*;do - n=$(basename $j) - cat $j > log_data/log${i}_${n}.log; - done - done - echo "Logs Generated" - fi - - if [ ! -d ${IN}/bio ]; then - if [ "$1" = "--small" ]; then - # download the Genome loc file - wget $BIO_DATA_LINK - unzip bio.zip - cd bio - wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt - cd .. - rm bio.zip - else - mkdir ${IN}/bio - cd ${IN}/bio - # download the file containing the links for the dataset - wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt - # download the Genome loc file - wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - # start downloading the real dataset - cat ${IN_NAME} |while read s_line; - do - echo ${IN_NAME} - sample=$(echo $s_line |cut -d " " -f 2); - if [[ ! -f $sample ]]; then - pop=$(echo $s_line |cut -f 1 -d " "); - link=$(echo $s_line |cut -f 3 -d " "); - wget -O "$sample".bam "$link"; ##this part can be adjusted maybe - fi - done; - fi - echo "Genome data downloaded" - fi - - # download the initial pcaps to populate the whole dataset - if [ ! -d ${IN}/pcap_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip - unzip pcaps.zip - rm pcaps.zip - mkdir ${IN}/pcap_data/ - # generates 20G - for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do - for j in ${IN}/pcaps/*;do - n=$(basename $j) - cat $j > pcap_data/pcap${i}_${n}; - done - done - echo "Pcaps Generated" - fi - - # download the modules for the Mir static analyses - if [ ! 
-d ${IN}/node_modules ]; then - cd $IN - wget $NODE_MODULE_LINK - unzip node_modules.zip - rm node_modules.zip - # download the specific mir version - wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip - unzip mir-sa.zip - rm mir-sa.zip - echo "Node modules generated" - fi - - # download the packages for the package building - if [ ! -f ${IN}/packages ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/packages - if [ "$1" = "--small" ]; then - head -n 20 packages > p - mv p packages - fi - echo "Package datset downloaded" - fi + hdfs dfs -mkdir /dependency_untangling + + if [ "$1" == "--small" ]; then + LOG_DATA_FILES=6 + WAV_DATA_FILES=20 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip + PCAP_DATA_FILES=1 + else + LOG_DATA_FILES=84 + WAV_DATA_FILES=120 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip + PCAP_DATA_FILES=15 + fi + + if [ ! -d ${IN}/wav ]; then + wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip + unzip wav.zip && cd wav/ + for f in *.wav; do + FILE=$(basename "$f") + for (( i = 0; i <= $WAV_DATA_FILES; i++)) do + echo copying to $f$i.wav + cp $f $f$i.wav + done + done + cd .. + hdfs dfs -put wav /dependency_untangling/wav + echo "WAV Generated" + fi + + if [ ! -d ${IN}/jpg ]; then + cd ${IN} + wget $JPG_DATA_LINK + unzip jpg.zip + hdfs dfs -put jpg /dependency_untangling/jpg + echo "JPG Generated" + rm -rf ${IN}/jpg.zip + fi + + # download the input for the nginx logs and populate the dataset + if [ ! 
-d ${IN}/log_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip + unzip nginx.zip + rm nginx.zip + # generating analysis logs + mkdir -p ${IN}/log_data + for (( i = 1; i <=$LOG_DATA_FILES; i++)) do + for j in nginx-logs/*;do + n=$(basename $j) + cat $j > log_data/log${i}_${n}.log; + done + done + hdfs dfs -put log_data /dependency_untangling/log_data + echo "Logs Generated" + fi + + # if [ ! -d ${IN}/bio ]; then + # if [ "$1" = "--small" ]; then + # # download the Genome loc file + # wget $BIO_DATA_LINK + # unzip bio.zip + # cd bio + # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt + # cd .. + # rm bio.zip + # else + # mkdir ${IN}/bio + # cd ${IN}/bio + # # download the file containing the links for the dataset + # wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt + # # download the Genome loc file + # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # # start downloading the real dataset + # cat ${IN_NAME} |while read s_line; + # do + # echo ${IN_NAME} + # sample=$(echo $s_line |cut -d " " -f 2); + # if [[ ! -f $sample ]]; then + # pop=$(echo $s_line |cut -f 1 -d " "); + # link=$(echo $s_line |cut -f 3 -d " "); + # wget -O "$sample".bam "$link"; ##this part can be adjusted maybe + # fi + # done; + # fi + # echo "Genome data downloaded" + # fi + + # download the initial pcaps to populate the whole dataset + if [ ! -d ${IN}/pcap_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip + unzip pcaps.zip + rm pcaps.zip + mkdir ${IN}/pcap_data/ + # generates 20G + for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do + for j in ${IN}/pcaps/*;do + n=$(basename $j) + cat $j > pcap_data/pcap${i}_${n}; + done + done + hdfs dfs -put pcap_data /dependency_untangling/pcap_data + echo "Pcaps Generated" + fi + + # # download the modules for the Mir static analyses + # if [ ! 
-d ${IN}/node_modules ]; then + # cd $IN + # wget $NODE_MODULE_LINK + # unzip node_modules.zip + # rm node_modules.zip + # # download the specific mir version + # wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip + # unzip mir-sa.zip + # rm mir-sa.zip + # echo "Node modules generated" + # fi + + # # download the packages for the package building + # if [ ! -f ${IN}/packages ]; then + # cd $IN + # wget http://pac-n4.csail.mit.edu:81/pash_data/packages + # if [ "$1" = "--small" ]; then + # head -n 20 packages > p + # mv p packages + # fi + # echo "Package datset downloaded" + # fi } source_var() { export IN= } - -setup_dataset diff --git a/evaluation/distr_benchmarks/dependency_untangling/nginx.sh b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh index afd53af8e..680de995e 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/nginx.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/nginx.sh @@ -1,36 +1,38 @@ #!/bin/bash # tag: nginx logs -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/log_data} +IN=${IN:-/dependency_untangling/log_data} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/nginx-logs} mkdir -p ${OUT} -run_tests() { - # i don't think we should assign things to $0, however, it works with both - IN=$1 - cat $IN | cut -d "\"" -f3 | cut -d ' ' -f2 | sort | uniq -c | sort -rn +pure_func() { + tempfile=$(mktemp) + + tee $tempfile | cut -d "\"" -f3 | cut -d ' ' -f2 | sort | uniq -c | sort -rn # awk alternative, too slow - awk '{print $9}' $IN | sort | uniq -c | sort -rn + awk '{print $9}' $tempfile | sort | uniq -c | sort -rn # find broken links broken links - awk '($9 ~ /404/)' $IN | awk '{print $7}' | sort | uniq -c | sort -rn + awk '($9 ~ /404/)' $tempfile | awk '{print $7}' | sort | uniq -c | sort -rn # for 502 (bad-gateway) we can run following command: - awk '($9 ~ /502/)' $IN | awk '{print $7}' | sort | uniq -c | sort -r + awk '($9 ~ /502/)' $tempfile | awk 
'{print $7}' | sort | uniq -c | sort -r # Who are requesting broken links (or URLs resulting in 502) - awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' $IN | awk '{print $1}' | sort | uniq -c | sort -r + awk -F\" '($2 ~ "/wp-admin/install.php"){print $1}' $tempfile | awk '{print $1}' | sort | uniq -c | sort -r # 404 for php files -mostly hacking attempts - awk '($9 ~ /404/)' $IN | awk -F\" '($2 ~ "^GET .*.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20 + awk '($9 ~ /404/)' $tempfile | awk -F\" '($2 ~ "^GET .*.php")' | awk '{print $7}' | sort | uniq -c | sort -r | head -n 20 ############################## # Most requested URLs ######## - awk -F\" '{print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r + awk -F\" '{print $2}' $tempfile | awk '{print $2}' | sort | uniq -c | sort -r # Most requested URLs containing XYZ - awk -F\" '($2 ~ "ref"){print $2}' $IN | awk '{print $2}' | sort | uniq -c | sort -r + awk -F\" '($2 ~ "ref"){print $2}' $tempfile | awk '{print $2}' | sort | uniq -c | sort -r + + rm $tempfile } +export -f pure_func -export -f run_tests -for f in ${IN}/*; do +for log in $(hdfs dfs -ls -C ${IN}); do #bash -c 'run_tests $0 $1' $f $f #> /dev/null #run_tests $f > /dev/null - logname=$OUT/$(basename $f) - run_tests $f > $logname + logname=$OUT/$(basename $log) + hdfs dfs -cat $log | pure_func > $logname done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh index d4e1b70ea..13a0cd29e 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh @@ -1,25 +1,26 @@ #!/bin/bash #tag: pcap analysis -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/pcap_data} +IN=${IN:-/dependency_untangling/pcap_data} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/pcap-analysis} -LOGS=${OUT}/logs -mkdir -p ${LOGS} 
-run_tests() { - INPUT=$1 - /usr/sbin/tcpdump -nn -r ${INPUT} -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null +mkdir -p $OUT + +pure_func() { + tempfile=$(mktemp) + + tee $tempfile | tcpdump -nn -r '-' -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null # extract URL - /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null + tcpdump -nn -r $tempfile -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null # extract passwords - /usr/sbin/tcpdump -nn -r ${INPUT} -s 0 -A -n -l 2> /dev/null | egrep -i "POST /|pwd=|passwd=|password=|Host:" 2> /dev/null -} -export -f run_tests + tcpdump -nn -r $tempfile -s 0 -A -n -l 2> /dev/null | egrep -i "POST /|pwd=|passwd=|password=|Host:" 2> /dev/null -pkg_count=0 + rm -f $tempfile +} +export -f pure_func -for item in ${IN}/*; +for item in $(hdfs dfs -ls -C ${IN}); do - pkg_count=$((pkg_count + 1)); - run_tests $item > ${LOGS}/${pkg_count}.log + logname=$OUT/$(basename $item).log; + hdfs dfs -cat $item | pure_func > $logname done echo 'done'; diff --git a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh index 8928ed6be..6cdaa9c84 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh @@ -1,50 +1,47 @@ -PASH_FLAGS='--width 6 --r_split' +PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R -export dict="$PASH_TOP/evaluation/benchmarks/oneliners/input/dict.txt" names_scripts=( "MediaConv1;img_convert" "MediaConv2;to_mp3" - "Program_Inference;proginf" + # "Program_Inference;proginf" "LogAnalysis1;nginx" "LogAnalysis2;pcap" # "Genomics_Computation;genomics" - "AurPkg;pacaur" + # "AurPkg;pacaur" "FileEnc1;compress_files" "FileEnc2;encrypt_files" ) -oneliners_bash() { - seq_times_file="seq.res" - seq_outputs_suffix="seq.out" 
- outputs_dir="outputs" - - mkdir -p "$outputs_dir" +dependency_untangling_bash() { + outputs_dir="outputs" + times_file="seq.res" + outputs_suffix="seq.out" - touch "$seq_times_file" - cat $seq_times_file > $seq_times_file.d - echo executing one-liners $(date) | tee -a "$seq_times_file" - echo '' > "$seq_times_file" + rm -rf input/output + mkdir -p "$outputs_dir" - for name_script in ${names_scripts[@]} - do + touch "$times_file" + cat "$times_file" > "$times_file".d + echo executing dependency_untangling $(date) | tee "$times_file" + echo '' >> "$times_file" + + export IN= + for name_script in ${names_scripts[@]} + do IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" script="${name_script_parsed[1]}" - export IN= - export OUT= - printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${name}.sh:${pad}" padded_script=${padded_script:0:30} - - seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" - - echo "${padded_script}" $({ time ./${script}.sh > "$seq_outputs_file"; } 2>&1) | tee -a "$seq_times_file" - done + outputs_file="${outputs_dir}/${script}.${outputs_suffix}" + echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + done } -oneliners_pash(){ + +dependency_untangling_pash() { flags=${1:-$PASH_FLAGS} prefix=${2:-par} @@ -54,37 +51,42 @@ oneliners_pash(){ outputs_dir="outputs" pash_logs_dir="pash_logs_$prefix" + rm -rf input/output/ + mkdir -p "$outputs_dir" mkdir -p "$pash_logs_dir" touch "$times_file" - cat $times_file > $times_file.d - echo executing one-liners with $prefix pash $(date) | tee -a "$times_file" - echo '' > "$times_file" - + cat "$times_file" > "$times_file".d + echo executing dependency_untangling with pash $(date) | tee "$times_file" + echo '' >> "$times_file" + + export IN= for name_script in ${names_scripts[@]} - do + do IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" 
script="${name_script_parsed[1]}" - - export IN= - export OUT= - printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${name}.sh:${pad}" padded_script=${padded_script:0:30} - outputs_file="${outputs_dir}/${script}.${outputs_suffix}" pash_log="${pash_logs_dir}/${script}.pash.log" single_time_file="${outputs_dir}/${script}.${time_suffix}" - + echo -n "${padded_script}" | tee -a "$times_file" { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done } -# oneliners_bash -oneliners_pash "$PASH_FLAGS" "par" -# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" +dependency_untangling_bash + +dependency_untangling_pash "$PASH_FLAGS" "par_no_du" + +dependency_untangling_pash "$PASH_FLAGS --parallel_pipelines --parallel_pipelines_limit 24" "par" + +dependency_untangling_pash "$PASH_FLAGS --distributed_exec" "distr_no_du" + +dependency_untangling_pash "$PASH_FLAGS --parallel_pipelines --distributed_exec --parallel_pipelines_limit 24" "distr" + diff --git a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh index c94a75b49..3b0187d14 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/to_mp3.sh @@ -1,17 +1,15 @@ #!/bin/bash # tag: wav-to-mp3 -IN=${IN:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/wav} +IN=${IN:-/dependency_untangling/wav} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/mp3} -LOGS=${OUT}/logs -mkdir -p ${LOGS} +mkdir -p ${OUT} + pure_func(){ ffmpeg -y -i pipe:0 -f mp3 -ab 192000 pipe:1 2>/dev/null } - export -f pure_func -pkg_count=0 -for item in $(hdfs dfs -ls -C /for-loops/wav); +for item in $(hdfs dfs -ls -C $IN); do pkg_count=$((pkg_count + 1)); out="$OUT/$(basename $item).mp3" From 6f0f5f303a4959de6a506626f792c7caf77702d3 
Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Mon, 6 Jun 2022 17:28:41 -0400 Subject: [PATCH 24/37] small changes to eval scripts --- evaluation/distr_benchmarks/analytics-mts/run.distr.sh | 4 ++-- .../distr_benchmarks/dependency_untangling/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/nlp/run.distr.sh | 8 ++++---- evaluation/distr_benchmarks/oneliners/run.distr.sh | 4 ++-- evaluation/distr_benchmarks/unix50/run.distr.sh | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh index 9426fcbd9..23c66af1d 100755 --- a/evaluation/distr_benchmarks/analytics-mts/run.distr.sh +++ b/evaluation/distr_benchmarks/analytics-mts/run.distr.sh @@ -15,7 +15,7 @@ analytics-mts_bash(){ mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing MTS analytics $(date) | tee "$times_file" echo '' >> "$times_file" ## FIXME 5.sh is not working yet @@ -47,7 +47,7 @@ analytics-mts_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing MTS analytics with pash $(date) | tee "$times_file" echo '' >> "$times_file" ## FIXME 5.sh is not working yet diff --git a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh index 6cdaa9c84..29ab821cc 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/run.distr.sh @@ -22,7 +22,7 @@ dependency_untangling_bash() { mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing dependency_untangling $(date) | tee "$times_file" echo '' >> "$times_file" @@ -57,7 +57,7 @@ dependency_untangling_pash() { mkdir -p "$pash_logs_dir" touch 
"$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing dependency_untangling with pash $(date) | tee "$times_file" echo '' >> "$times_file" diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index c1285f73a..a77c00346 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -35,7 +35,7 @@ names_scripts=( "words_no_vowels;6_3" ) -bash_nlp(){ +nlp_bash(){ outputs_dir="outputs" times_file="seq.res" outputs_suffix="seq.out" @@ -43,7 +43,7 @@ bash_nlp(){ mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix-for-nlp $(date) | tee "$times_file" echo '' >> "$times_file" @@ -76,7 +76,7 @@ nlp_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix-for-nlp with $prefix pash $(date) | tee "$times_file" echo '' >> "$times_file" @@ -99,7 +99,7 @@ nlp_pash(){ done } -bash_nlp +nlp_bash nlp_pash "$PASH_FLAGS" "par_no_du" diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 6eeaf36ac..5305426f6 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -26,7 +26,7 @@ oneliners_bash() { mkdir -p "$outputs_dir" touch "$seq_times_file" - cat $seq_times_file > $seq_times_file.d + cat $seq_times_file >> $seq_times_file.d echo executing one-liners $(date) | tee "$seq_times_file" echo '' >> "$seq_times_file" @@ -65,7 +65,7 @@ oneliners_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat $times_file > $times_file.d + cat $times_file >> $times_file.d echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file" echo '' >> "$times_file" diff --git 
a/evaluation/distr_benchmarks/unix50/run.distr.sh b/evaluation/distr_benchmarks/unix50/run.distr.sh index 2526bbbe4..c4dd9149d 100755 --- a/evaluation/distr_benchmarks/unix50/run.distr.sh +++ b/evaluation/distr_benchmarks/unix50/run.distr.sh @@ -16,7 +16,7 @@ unix50_bash(){ mkdir -p "$outputs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" @@ -49,7 +49,7 @@ unix50_pash(){ mkdir -p "$pash_logs_dir" touch "$times_file" - cat "$times_file" > "$times_file".d + cat "$times_file" >> "$times_file".d echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" From 9d824998f8d03c6541084589d6275532166b7e90 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 7 Jun 2022 13:20:21 +0000 Subject: [PATCH 25/37] improve oneliners eval and setup scripts --- .../analytics-mts/input/setup.sh | 44 +++++++------------ evaluation/distr_benchmarks/oneliners/diff.sh | 2 +- .../distr_benchmarks/oneliners/input/setup.sh | 32 +++++--------- .../distr_benchmarks/oneliners/nfa-regex.sh | 2 +- .../distr_benchmarks/oneliners/run.distr.sh | 23 ++++------ .../distr_benchmarks/oneliners/set-diff.sh | 2 +- .../oneliners/shortest-scripts.sh | 2 +- .../distr_benchmarks/oneliners/sort-sort.sh | 2 +- evaluation/distr_benchmarks/oneliners/sort.sh | 2 +- .../distr_benchmarks/oneliners/spell.sh | 2 +- .../distr_benchmarks/oneliners/top-n.sh | 2 +- 11 files changed, 44 insertions(+), 71 deletions(-) diff --git a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh index 7dc7d067f..df6ce23ca 100755 --- a/evaluation/distr_benchmarks/analytics-mts/input/setup.sh +++ b/evaluation/distr_benchmarks/analytics-mts/input/setup.sh @@ -13,31 +13,21 @@ if [[ "$1" == "-c" ]]; then exit fi -setup_dataset() { - hdfs dfs -mkdir /analytics-mts - if [ ! 
-f ./in.csv ] && [ "$1" != "--small" ];; then - # yesterday=$(date --date='1 days ago' +'%y-%m-%d') - # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | - curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv - if [ $? -ne 0 ]; then - echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors" - exit 1 - fi - hdfs dfs -put in.csv /analytics-mts/in.csv - elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then - if [ ! -f ./in_small.csv ]; then - echo "Generating small-size inputs" - # FIXME PR: Do we need all of them? - curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv - fi - hdfs dfs -put in_small.csv /analytics-mts/in_small.csv +hdfs dfs -mkdir /analytics-mts +if [ ! -f ./in.csv ] && [ "$1" != "--small" ]; then + # yesterday=$(date --date='1 days ago' +'%y-%m-%d') + # curl https://www.balab.aueb.gr/~dds/oasa-$yesterday.bz2 | + curl -sf 'https://www.balab.aueb.gr/~dds/oasa-2021-01-08.bz2' | bzip2 -d > in.csv + if [ $? -ne 0 ]; then + echo "oasa-2021-01-08.bz2 / bzip2 not available, contact the pash authors" + exit 1 fi -} - -source_var() { - if [[ "$1" == "--small" ]]; then - export IN="analytics-mts/in_small.csv" - else - export IN="analytics-mts/in.csv" - fi -} + hdfs dfs -put in.csv /analytics-mts/in.csv +elif [ ! -f ./in_small.csv ] && [ "$1" = "--small" ]; then + if [ ! -f ./in_small.csv ]; then + echo "Generating small-size inputs" + # FIXME PR: Do we need all of them? 
+ curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/in_small.csv' > in_small.csv + fi + hdfs dfs -put in_small.csv /analytics-mts/in_small.csv +fi diff --git a/evaluation/distr_benchmarks/oneliners/diff.sh b/evaluation/distr_benchmarks/oneliners/diff.sh index 9435ad1d7..5b771e394 100755 --- a/evaluation/distr_benchmarks/oneliners/diff.sh +++ b/evaluation/distr_benchmarks/oneliners/diff.sh @@ -3,7 +3,7 @@ # Taken from https://crashingdaily.wordpress.com/2008/03/06/diff-two-stdout-streams/ # shuf() { awk 'BEGIN {srand(); OFMT="%.17f"} {print rand(), $0}' "$@" | sort -k1,1n | cut -d ' ' -f2-; } -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} mkfifo s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index 3d4921c22..a24725912 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -13,6 +13,8 @@ if [[ "$1" == "-c" ]]; then exit fi +hdfs dfs -mkdir /oneliners + if [ ! -f ./1M.txt ]; then curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? -ne 0 ]; then @@ -67,30 +69,16 @@ if [ ! -f ./all_cmdsx100.txt ]; then done fi - -if [ "$#" -eq 1 ] && [ "$1" = "--full" ]; then - echo "Generating full-size inputs" - - - if [ ! -f ./3G.txt ]; then - touch 3G.txt - for (( i = 0; i < 3; i++ )); do - cat 1G.txt >> 3G.txt - done - fi - input_files+=("3G.txt") - - if [ ! -f ./10G.txt ]; then - touch 10G.txt - for (( i = 0; i < 10; i++ )); do - cat 1G.txt >> 10G.txt - done - fi - input_files+=("10G.txt") +if [ ! 
-f ./3G.txt ]; then + touch 3G.txt + for (( i = 0; i < 3; i++ )); do + cat 1G.txt >> 3G.txt + done fi +input_files+=("3G.txt") # Add files with different replication factors for file in "${input_files[@]}"; do - hdfs dfs -Ddfs.replication=1 -put $file /rep1_$file - hdfs dfs -Ddfs.replication=3 -put $file /rep3_$file + hdfs dfs -put $file /oneliners/$file + rm -f $file done \ No newline at end of file diff --git a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh index 2a2c30718..2594da3eb 100755 --- a/evaluation/distr_benchmarks/oneliners/nfa-regex.sh +++ b/evaluation/distr_benchmarks/oneliners/nfa-regex.sh @@ -1,6 +1,6 @@ #!/bin/bash # Match complex regular-expression over input -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | tr A-Z a-z | grep '\(.\).*\1\(.\).*\2\(.\).*\3\(.\).*\4' diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 5305426f6..95adff56b 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -19,9 +19,8 @@ scripts_inputs=( oneliners_bash() { outputs_dir="outputs" - rep=${1:-rep3} - seq_times_file=$rep"_seq.res" - seq_outputs_suffix=$rep"_seq.out" + seq_times_file="seq.res" + seq_outputs_suffix="seq.out" mkdir -p "$outputs_dir" @@ -36,7 +35,7 @@ oneliners_bash() { script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN=/$rep\_$input + export IN="/oneliners/$input" export dict= printf -v pad %30s @@ -52,8 +51,7 @@ oneliners_bash() { oneliners_pash(){ flags=${1:-$PASH_FLAGS} prefix=${2:-par} - rep=${3:-rep3} - prefix=$prefix\_$rep + prefix=$prefix times_file="$prefix.res" outputs_suffix="$prefix.out" @@ -66,7 +64,7 @@ oneliners_pash(){ touch "$times_file" cat $times_file >> $times_file.d - echo executing one-liners with $prefix pash with data $rep $(date) | tee "$times_file" + echo executing one-liners with $prefix 
pash with data $(date) | tee "$times_file" echo '' >> "$times_file" for script_input in ${scripts_inputs[@]} @@ -75,7 +73,7 @@ oneliners_pash(){ script="${script_input_parsed[0]}" input="${script_input_parsed[1]}" - export IN=/$rep\_$input + export IN="/oneliners/$input" export dict= printf -v pad %30s @@ -92,11 +90,8 @@ oneliners_pash(){ done } -# oneliners_bash "rep1" -oneliners_bash "rep3" +oneliners_bash -# oneliners_pash "$PASH_FLAGS" "par" "rep1" -oneliners_pash "$PASH_FLAGS" "par" "rep3" +oneliners_pash "$PASH_FLAGS" "par" -# oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep1" -oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" "rep3" +oneliners_pash "$PASH_FLAGS --distributed_exec" "distr" diff --git a/evaluation/distr_benchmarks/oneliners/set-diff.sh b/evaluation/distr_benchmarks/oneliners/set-diff.sh index 039e6996f..715488315 100755 --- a/evaluation/distr_benchmarks/oneliners/set-diff.sh +++ b/evaluation/distr_benchmarks/oneliners/set-diff.sh @@ -2,7 +2,7 @@ # Show the set-difference between two streams (i.e., elements in the first that are not in the second). # https://stackoverflow.com/questions/2509533/bash-linux-set-difference-between-two-text-files -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} mkfifo s1 s2 diff --git a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh index 63a5bc3d9..b8999923b 100755 --- a/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh +++ b/evaluation/distr_benchmarks/oneliners/shortest-scripts.sh @@ -6,6 +6,6 @@ # FIX: Input here should be a set of commands, more precisely, the ones on this specific machine. 
-IN=${IN:-/all_cmdsx100.txt} +IN=${IN:-/oneliners/all_cmdsx100.txt} hdfs dfs -cat $IN | xargs file | grep "shell script" | cut -d: -f1 | xargs -L 1 wc -l | grep -v '^0$' | sort -n | head -15 diff --git a/evaluation/distr_benchmarks/oneliners/sort-sort.sh b/evaluation/distr_benchmarks/oneliners/sort-sort.sh index 7b51ed889..16c372abc 100755 --- a/evaluation/distr_benchmarks/oneliners/sort-sort.sh +++ b/evaluation/distr_benchmarks/oneliners/sort-sort.sh @@ -1,6 +1,6 @@ #!/bin/bash # Calculate sort twice -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | tr A-Z a-z | sort | sort -r diff --git a/evaluation/distr_benchmarks/oneliners/sort.sh b/evaluation/distr_benchmarks/oneliners/sort.sh index 29cffa1cf..359701649 100755 --- a/evaluation/distr_benchmarks/oneliners/sort.sh +++ b/evaluation/distr_benchmarks/oneliners/sort.sh @@ -1,7 +1,7 @@ #!/bin/bash # Sort input -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | sort diff --git a/evaluation/distr_benchmarks/oneliners/spell.sh b/evaluation/distr_benchmarks/oneliners/spell.sh index 7928babe4..c8b2ddaa9 100755 --- a/evaluation/distr_benchmarks/oneliners/spell.sh +++ b/evaluation/distr_benchmarks/oneliners/spell.sh @@ -1,7 +1,7 @@ #!/bin/bash # Calculate mispelled words in an input # https://dl.acm.org/doi/10.1145/3532.315102 -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} dict=${dict:-$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt} hdfs dfs -cat $IN | diff --git a/evaluation/distr_benchmarks/oneliners/top-n.sh b/evaluation/distr_benchmarks/oneliners/top-n.sh index c2f7f2b21..ba2b4eb8d 100755 --- a/evaluation/distr_benchmarks/oneliners/top-n.sh +++ b/evaluation/distr_benchmarks/oneliners/top-n.sh @@ -2,7 +2,7 @@ # Top-N (1000) terms # from https://dl.acm.org/doi/10.1145/5948.315654 -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} hdfs dfs -cat $IN | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | tr A-Z a-z | sort | uniq -c | sort -rn | sed 100q From 
c6c2c387360bbf6910a87807ff44f72cfb84b445 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 7 Jun 2022 21:24:24 +0000 Subject: [PATCH 26/37] fix du installation scripts --- .../input/install-deps.sh | 48 ++-- .../dependency_untangling/input/setup.sh | 250 +++++++++--------- .../distr_benchmarks/install_all_deps.sh | 3 + 3 files changed, 152 insertions(+), 149 deletions(-) create mode 100755 evaluation/distr_benchmarks/install_all_deps.sh diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh index 3d4a75b1a..3bacbcaff 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/install-deps.sh @@ -1,35 +1,41 @@ -IN=$PASH_TOP/evaluation/benchmarks/dependency_untangling/input/ +IN=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/ mkdir -p ${IN}/deps/ # install dependencies -pkgs='ffmpeg unrtf imagemagick libarchive-tools zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump' +pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump' if ! dpkg -s $pkgs >/dev/null 2>&1 ; then sudo apt-get install $pkgs -y echo 'Packages Installed' fi -# NOT used -# if [ ! -d ${IN}/deps/samtools-1.7 ]; then -# cd ${IN}/deps/ -# wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip -# unzip 1.7.zip -# rm 1.7.zip -# cd samtools-1.7 -# wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip -# unzip 1.7.zip -# autoheader # Build config.h.in (this may generate a warning about -# # AC_CONFIG_SUBDIRS - please ignore it). -# autoconf -Wno-syntax # Generate the configure script -# ./configure # Needed for choosing optional functionality -# make -# rm -rf 1.7.zip -# echo 'Samtools installed' -# fi +if [ ! 
-d ${IN}/deps/samtools-1.7 ]; then + cd ${IN}/deps/ + wget https://github.com/samtools/samtools/archive/refs/tags/1.7.zip + unzip 1.7.zip + rm 1.7.zip + cd samtools-1.7 + wget https://github.com/samtools/htslib/archive/refs/tags/1.7.zip + unzip 1.7.zip + autoheader # Build config.h.in (this may generate a warning about + # AC_CONFIG_SUBDIRS - please ignore it). + autoconf -Wno-syntax # Generate the configure script + ./configure # Needed for choosing optional functionality + make + rm -rf 1.7.zip + echo 'Samtools installed' +fi -# if [ ! -f ${IN}/deps/makedeb.deb ]; then +if [ ! -d ${IN}/mir-sa ]; then + # download the specific mir version + cd ${IN} + wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip + unzip mir-sa.zip + rm mir-sa.zip +fi + +# if ! dpkg -s "makedeb-makepkg" >/dev/null 2>&1 ; then # cd ${IN}/deps/ # wget http://pac-n4.csail.mit.edu:81/pash_data/makedeb.deb # sudo dpkg -i makedeb.deb # echo 'Makedeb installed' # fi - diff --git a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh index d3baf70ca..8c147e49c 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/input/setup.sh @@ -24,142 +24,136 @@ if [ "$1" == "-c" ]; then exit fi -setup_dataset() { - hdfs dfs -mkdir /dependency_untangling +hdfs dfs -mkdir /dependency_untangling - if [ "$1" == "--small" ]; then - LOG_DATA_FILES=6 - WAV_DATA_FILES=20 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip - JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip - PCAP_DATA_FILES=1 - else - LOG_DATA_FILES=84 - WAV_DATA_FILES=120 - NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip - BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip - 
JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip - PCAP_DATA_FILES=15 - fi +if [ "$1" == "--small" ]; then + LOG_DATA_FILES=6 + WAV_DATA_FILES=20 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/small/jpg.zip + PCAP_DATA_FILES=1 +else + LOG_DATA_FILES=84 + WAV_DATA_FILES=120 + NODE_MODULE_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/node_modules.zip + BIO_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/bio.zip + JPG_DATA_LINK=http://pac-n4.csail.mit.edu:81/pash_data/full/jpg.zip + PCAP_DATA_FILES=15 +fi - if [ ! -d ${IN}/wav ]; then - wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip - unzip wav.zip && cd wav/ - for f in *.wav; do - FILE=$(basename "$f") - for (( i = 0; i <= $WAV_DATA_FILES; i++)) do - echo copying to $f$i.wav - cp $f $f$i.wav - done +if [ ! -d ${IN}/wav ]; then + wget http://pac-n4.csail.mit.edu:81/pash_data/wav.zip + unzip wav.zip && cd wav/ + for f in *.wav; do + FILE=$(basename "$f") + for (( i = 0; i <= $WAV_DATA_FILES; i++)) do + echo copying to $f$i.wav + cp $f $f$i.wav done - cd .. - hdfs dfs -put wav /dependency_untangling/wav - echo "WAV Generated" - fi + done + cd .. + hdfs dfs -put wav /dependency_untangling/wav + echo "WAV Generated" +fi - if [ ! -d ${IN}/jpg ]; then - cd ${IN} - wget $JPG_DATA_LINK - unzip jpg.zip - hdfs dfs -put jpg /dependency_untangling/jpg - echo "JPG Generated" - rm -rf ${IN}/jpg.zip - fi +if [ ! -d ${IN}/jpg ]; then + cd ${IN} + wget $JPG_DATA_LINK + unzip jpg.zip + hdfs dfs -put jpg /dependency_untangling/jpg + echo "JPG Generated" + rm -rf ${IN}/jpg.zip +fi - # download the input for the nginx logs and populate the dataset - if [ ! 
-d ${IN}/log_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip - unzip nginx.zip - rm nginx.zip - # generating analysis logs - mkdir -p ${IN}/log_data - for (( i = 1; i <=$LOG_DATA_FILES; i++)) do - for j in nginx-logs/*;do - n=$(basename $j) - cat $j > log_data/log${i}_${n}.log; - done +# download the input for the nginx logs and populate the dataset +if [ ! -d ${IN}/log_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/nginx.zip + unzip nginx.zip + rm nginx.zip + # generating analysis logs + mkdir -p ${IN}/log_data + for (( i = 1; i <=$LOG_DATA_FILES; i++)) do + for j in nginx-logs/*;do + n=$(basename $j) + cat $j > log_data/log${i}_${n}.log; done - hdfs dfs -put log_data /dependency_untangling/log_data - echo "Logs Generated" + done + hdfs dfs -put log_data /dependency_untangling/log_data + echo "Logs Generated" +fi + +if [ ! -d ${IN}/bio ]; then + if [ "$1" = "--small" ]; then + # download the Genome loc file + wget $BIO_DATA_LINK + unzip bio.zip + cd bio + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt + cd .. + rm bio.zip + else + mkdir ${IN}/bio + cd ${IN}/bio + # download the file containing the links for the dataset + wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt + # download the Genome loc file + wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt + # start downloading the real dataset + IN_NAME=$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/bio/100G.txt + cat ${IN_NAME} | while read s_line; + do + echo ${IN_NAME} + sample=$(echo $s_line |cut -d " " -f 2); + if [[ ! -f $sample ]]; then + pop=$(echo $s_line |cut -f 1 -d " "); + link=$(echo $s_line |cut -f 3 -d " "); + wget -O "$sample".bam "$link"; ##this part can be adjusted maybe + fi + done; + cd .. fi + hdfs dfs -put bio /dependency_untangling/bio + echo "Genome data downloaded" +fi - # if [ ! 
-d ${IN}/bio ]; then - # if [ "$1" = "--small" ]; then - # # download the Genome loc file - # wget $BIO_DATA_LINK - # unzip bio.zip - # cd bio - # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - # wget http://pac-n4.csail.mit.edu:81/pash_data/small/100G.txt - # cd .. - # rm bio.zip - # else - # mkdir ${IN}/bio - # cd ${IN}/bio - # # download the file containing the links for the dataset - # wget http://pac-n4.csail.mit.edu:81/pash_data/100G.txt - # # download the Genome loc file - # wget http://pac-n4.csail.mit.edu:81/pash_data/Gene_locs.txt - # # start downloading the real dataset - # cat ${IN_NAME} |while read s_line; - # do - # echo ${IN_NAME} - # sample=$(echo $s_line |cut -d " " -f 2); - # if [[ ! -f $sample ]]; then - # pop=$(echo $s_line |cut -f 1 -d " "); - # link=$(echo $s_line |cut -f 3 -d " "); - # wget -O "$sample".bam "$link"; ##this part can be adjusted maybe - # fi - # done; - # fi - # echo "Genome data downloaded" - # fi - - # download the initial pcaps to populate the whole dataset - if [ ! -d ${IN}/pcap_data ]; then - cd $IN - wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip - unzip pcaps.zip - rm pcaps.zip - mkdir ${IN}/pcap_data/ - # generates 20G - for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do - for j in ${IN}/pcaps/*;do - n=$(basename $j) - cat $j > pcap_data/pcap${i}_${n}; - done +# download the initial pcaps to populate the whole dataset +if [ ! -d ${IN}/pcap_data ]; then + cd $IN + wget http://pac-n4.csail.mit.edu:81/pash_data/pcaps.zip + unzip pcaps.zip + rm pcaps.zip + mkdir ${IN}/pcap_data/ + # generates 20G + for (( i = 1; i <= $PCAP_DATA_FILES; i++ )) do + for j in ${IN}/pcaps/*;do + n=$(basename $j) + cat $j > pcap_data/pcap${i}_${n}; done - hdfs dfs -put pcap_data /dependency_untangling/pcap_data - echo "Pcaps Generated" - fi - - # # download the modules for the Mir static analyses - # if [ ! 
-d ${IN}/node_modules ]; then - # cd $IN - # wget $NODE_MODULE_LINK - # unzip node_modules.zip - # rm node_modules.zip - # # download the specific mir version - # wget http://pac-n4.csail.mit.edu:81/pash_data/mir-sa.zip - # unzip mir-sa.zip - # rm mir-sa.zip - # echo "Node modules generated" - # fi + done + hdfs dfs -put pcap_data /dependency_untangling/pcap_data + echo "Pcaps Generated" +fi - # # download the packages for the package building - # if [ ! -f ${IN}/packages ]; then - # cd $IN - # wget http://pac-n4.csail.mit.edu:81/pash_data/packages - # if [ "$1" = "--small" ]; then - # head -n 20 packages > p - # mv p packages - # fi - # echo "Package datset downloaded" - # fi -} +# download the modules for the Mir static analyses +if [ ! -d ${IN}/node_modules ]; then + cd $IN + wget $NODE_MODULE_LINK + unzip node_modules.zip + rm node_modules.zip + hdfs dfs -put node_modules /dependency_untangling/node_modules + echo "Node modules generated" +fi -source_var() { - export IN= -} +# # download the packages for the package building +# if [ ! 
-f ${IN}/packages ]; then +# cd $IN +# wget http://pac-n4.csail.mit.edu:81/pash_data/packages +# if [ "$1" = "--small" ]; then +# head -n 20 packages > p +# mv p packages +# fi +# echo "Package datset downloaded" +# fi diff --git a/evaluation/distr_benchmarks/install_all_deps.sh b/evaluation/distr_benchmarks/install_all_deps.sh new file mode 100755 index 000000000..3bf174252 --- /dev/null +++ b/evaluation/distr_benchmarks/install_all_deps.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +bash ./dependency_untangling/input/install-deps.sh \ No newline at end of file From a056c467a47fa1f2cfe59bc433af3a6f5504219b Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Tue, 7 Jun 2022 21:25:19 +0000 Subject: [PATCH 27/37] Add newly added benchmarks to the run all script --- evaluation/distr_benchmarks/run_all.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/evaluation/distr_benchmarks/run_all.sh b/evaluation/distr_benchmarks/run_all.sh index 9162bd352..fdd89e2f9 100755 --- a/evaluation/distr_benchmarks/run_all.sh +++ b/evaluation/distr_benchmarks/run_all.sh @@ -1,5 +1,14 @@ cd $PASH_TOP/evaluation/distr_benchmarks/oneliners bash run.distr.sh +cd $PASH_TOP/evaluation/distr_benchmarks/unix50 +bash run.distr.sh + cd $PASH_TOP/evaluation/distr_benchmarks/nlp +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/analytics-mts +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/dependency_untangling bash run.distr.sh \ No newline at end of file From 9ee0791077b0de94683c83f6f542f3b5598048f0 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 8 Jun 2022 19:51:05 +0000 Subject: [PATCH 28/37] use gzip instead of zip for better streaming support --- .../dependency_untangling/compress_files.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh index d7c331e84..63f405099 100755 --- 
a/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/compress_files.sh @@ -4,16 +4,11 @@ IN=${IN:-/dependency_untangling/pcap_data/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/dependency_untangling/input/output/compress} mkdir -p ${OUT} -pure_func() { - zip -r -- -} - -export -f pure_func for item in $(hdfs dfs -ls -C ${IN}); do output_name=$(basename $item).zip - hdfs dfs -cat $item | pure_func > $OUT/$output_name + hdfs dfs -cat $item | gzip -c > $OUT/$output_name done echo 'done'; From 63fe6145111c13c3c0c1821d6b261f5db8a18883 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 8 Jun 2022 19:55:04 +0000 Subject: [PATCH 29/37] small changes to setup script --- .../distr_benchmarks/nlp/input/setup.sh | 55 ++++++-------- .../distr_benchmarks/unix50/input/setup.sh | 75 +++++++++---------- 2 files changed, 60 insertions(+), 70 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 1875bbb8a..48bdde472 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -4,21 +4,20 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} [[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } -setup_dataset() { - if [ ! -f ./genesis ]; then - curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis - "$PASH_TOP/scripts/append_nl_if_not.sh" genesis - fi +if [ ! -f ./genesis ]; then + curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis + "$PASH_TOP/scripts/append_nl_if_not.sh" genesis +fi - if [ ! -f ./exodus ]; then - curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus - "$PASH_TOP/scripts/append_nl_if_not.sh" exodus - fi +if [ ! -f ./exodus ]; then + curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus + "$PASH_TOP/scripts/append_nl_if_not.sh" exodus +fi - if [ ! 
-e ./pg ]; then - mkdir pg - cd pg - if [[ "$1" == "--gen-full" ]]; then +if [ ! -e ./pg ]; then + mkdir pg + cd pg + if [[ "$1" == "--full" ]]; then echo 'N.b.: download/extraction will take about 10min' wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then @@ -36,24 +35,16 @@ setup_dataset() { mv data/* . rm nlp.zip data -rf fi - for f in *.txt; do - "$PASH_TOP/scripts/append_nl_if_not.sh" $f - done - cd .. - fi - # Put files in hdfs - hdfs dfs -mkdir /nlp - hdfs dfs -put exodus /nlp/exodus - hdfs dfs -put genesis /nlp/genesis - hdfs dfs -put pg /nlp/pg -} + for f in *.txt; do + "$PASH_TOP/scripts/append_nl_if_not.sh" $f + done + cd .. + +fi -source_var() { - if [[ "$1" == "--small" ]]; then - export ENTRIES=40 - else - # 1% of the input - export ENTRIES=1060 - fi -} +# Put files in hdfs +hdfs dfs -mkdir /nlp +hdfs dfs -put exodus /nlp/exodus +hdfs dfs -put genesis /nlp/genesis +hdfs dfs -put pg /nlp/pg diff --git a/evaluation/distr_benchmarks/unix50/input/setup.sh b/evaluation/distr_benchmarks/unix50/input/setup.sh index 68b831d82..4a7c37dec 100755 --- a/evaluation/distr_benchmarks/unix50/input/setup.sh +++ b/evaluation/distr_benchmarks/unix50/input/setup.sh @@ -19,46 +19,45 @@ if [[ "$1" == "-c" ]]; then exit fi -setup_dataset() { - # Put files in hdfs - hdfs dfs -mkdir /unix50 - - # generate small inputs - # if [ "$#" -eq 1 ] && [ "$1" = "--small" ]; then - # if [ ! -d ./small ]; then - # echo "Generating small-size inputs" - # # FIXME PR: Do we need all of them? - # curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/unix50.zip' > unix50.zip - # unzip unix50.zip - # rm -f unix50.zip - # fi - # hdfs dfs -put small /unix50/small - # return 0 - # fi - - for input in ${inputs[@]} - do - if [ ! 
-f "${input}.txt" ]; then - wget "http://ndr.md/data/unix50/${input}.txt" - "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" - fi - hdfs dfs -put "${input}.txt" /unix50/"${input}.txt" - done +# Put files in hdfs +hdfs dfs -mkdir /unix50 - # increase the original input size 10x - if [ "$#" -eq 1 ] && [ "$1" = "--extended" ]; then - EXTENDED_INPUT_DIR="extended_input/" - mkdir -p $EXTENDED_INPUT_DIR - for file in *.txt; do - rm $EXTENDED_INPUT_DIR/$file - for (( i = 0; i < 10; i++ )); do - cat $file >> $EXTENDED_INPUT_DIR/temp.txt - done - done - hdfs dfs -put $EXTENDED_INPUT_DIR /unix50/$EXTENDED_INPUT_DIR - rm -rf $EXTENDED_INPUT_DIR +# generate small inputs +# if [ "$#" -eq 1 ] && [ "$1" = "--small" ]; then +# if [ ! -d ./small ]; then +# echo "Generating small-size inputs" +# # FIXME PR: Do we need all of them? +# curl -sf 'http://pac-n4.csail.mit.edu:81/pash_data/small/unix50.zip' > unix50.zip +# unzip unix50.zip +# rm -f unix50.zip +# fi +# hdfs dfs -put small /unix50/small +# return 0 +# fi + +for input in ${inputs[@]} +do + if [ ! 
-f "${input}.txt" ]; then + wget "http://ndr.md/data/unix50/${input}.txt" + "$PASH_TOP/scripts/append_nl_if_not.sh" "${input}.txt" fi -} + hdfs dfs -put "${input}.txt" /unix50/"${input}.txt" +done + +# increase the original input size 10x +if [ "$#" -eq 1 ] && [ "$1" = "--extended" ]; then + EXTENDED_INPUT_DIR="extended_input/" + mkdir -p $EXTENDED_INPUT_DIR + for file in *.txt; do + rm $EXTENDED_INPUT_DIR/$file + for (( i = 0; i < 10; i++ )); do + cat $file >> $EXTENDED_INPUT_DIR/temp.txt + done + done + hdfs dfs -put $EXTENDED_INPUT_DIR /unix50/$EXTENDED_INPUT_DIR + rm -rf $EXTENDED_INPUT_DIR +fi + source_var() { if [[ "$1" == "--extended" ]]; then From badea8dc442d485fa8df32c0167d3f43aeff6b31 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 8 Jun 2022 19:55:23 +0000 Subject: [PATCH 30/37] fix bug in pcap.sh --- evaluation/distr_benchmarks/dependency_untangling/pcap.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh index 13a0cd29e..237ab092a 100755 --- a/evaluation/distr_benchmarks/dependency_untangling/pcap.sh +++ b/evaluation/distr_benchmarks/dependency_untangling/pcap.sh @@ -6,8 +6,8 @@ mkdir -p $OUT pure_func() { tempfile=$(mktemp) - - tee $tempfile | tcpdump -nn -r '-' -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null + cat > $tempfile + tcpdump -nn -r $tempfile -A 'port 53' 2> /dev/null | sort | uniq |grep -Ev '(com|net|org|gov|mil|arpa)' 2> /dev/null # extract URL tcpdump -nn -r $tempfile -s 0 -v -n -l 2> /dev/null | egrep -i "POST /|GET /|Host:" 2> /dev/null # extract passwords From bb3f2e01c2c44efe1e4b2210071ce4c0ec7526c0 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sat, 11 Jun 2022 18:32:33 +0000 Subject: [PATCH 31/37] Add max-temp benchmark --- .../distr_benchmarks/max-temp/input/setup.sh | 31 +++++++++++ .../distr_benchmarks/max-temp/run.distr.sh | 53 
+++++++++++++++++++ .../max-temp/temp-analytics.sh | 20 +++++++ 3 files changed, 104 insertions(+) create mode 100755 evaluation/distr_benchmarks/max-temp/input/setup.sh create mode 100755 evaluation/distr_benchmarks/max-temp/run.distr.sh create mode 100755 evaluation/distr_benchmarks/max-temp/temp-analytics.sh diff --git a/evaluation/distr_benchmarks/max-temp/input/setup.sh b/evaluation/distr_benchmarks/max-temp/input/setup.sh new file mode 100755 index 000000000..ffb5f47b4 --- /dev/null +++ b/evaluation/distr_benchmarks/max-temp/input/setup.sh @@ -0,0 +1,31 @@ +#!/bin/bash +FROM=${FROM:-2015} +TO=${TO:-2015} +IN=${IN:-'http://ndr.md/data/noaa/'} +fetch=${fetch:-"curl -s"} + +data_file=temperatures.txt + +if [[ "$1" == "--extended" ]]; then + echo "Downloading extended input" + dataset_size=14418 +else + dataset_size=1442 +fi + +## Downloading and extracting +seq $FROM $TO | + sed "s;^;$IN;" | + sed 's;$;/;' | + xargs -r -n 1 $fetch | + grep gz | + tr -s ' \n' | + cut -d ' ' -f9 | + sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | + sed "s;^;$IN;" | + head -n $dataset_size | + xargs -n1 $fetch | + gunzip > "${data_file}" + +hdfs dfs -mkdir /max-temp +hdfs dfs -put "${data_file}" /max-temp/"${data_file}" \ No newline at end of file diff --git a/evaluation/distr_benchmarks/max-temp/run.distr.sh b/evaluation/distr_benchmarks/max-temp/run.distr.sh new file mode 100755 index 000000000..3114fb3b9 --- /dev/null +++ b/evaluation/distr_benchmarks/max-temp/run.distr.sh @@ -0,0 +1,53 @@ +PASH_FLAGS='--width 8 --r_split' +export TIMEFORMAT=%R + +export IN="/max-temp/temperatures.txt" + +max-temp_bash(){ + times_file="seq.res" + outputs_suffix="seq.out" + outputs_dir="outputs" + + mkdir -p "$outputs_dir" + touch "$times_file" + echo executing max temp $(date) | tee -a "$times_file" + outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" + echo "max-temp.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" +} + +max-temp_pash(){ + 
flags=${1:-$PASH_FLAGS} + prefix=${2:-par} + + times_file="$prefix.res" + outputs_suffix="$prefix.out" + time_suffix="$prefix.time" + outputs_dir="outputs" + pash_logs_dir="pash_logs_$prefix" + + mkdir -p "$outputs_dir" + mkdir -p "$pash_logs_dir" + + touch "$times_file" + cat "$times_file" >> "$times_file".d + echo executing max-temp with $prefix pash $(date) | tee "$times_file" + echo '' >> "$times_file" + + outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" + pash_log="${pash_logs_dir}/temp-analytics.pash.log" + single_time_file="${outputs_dir}/temp-analytics.${time_suffix}" + + echo -n "temp-analytics.sh: " | tee -a "$times_file" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" temp-analytics.sh > "$outputs_file"; } 2> "${single_time_file}" + cat "${single_time_file}" | tee -a "$times_file" +} + +max-temp_bash + +max-temp_pash "$PASH_FLAGS" "par_no_du" + +max-temp_pash "$PASH_FLAGS --parallel_pipelines --parallel_pipelines_limit 24" "par" + +max-temp_pash "$PASH_FLAGS --distributed_exec" "distr_no_du" + +max-temp_pash "$PASH_FLAGS --parallel_pipelines --distributed_exec --parallel_pipelines_limit 24" "distr" diff --git a/evaluation/distr_benchmarks/max-temp/temp-analytics.sh b/evaluation/distr_benchmarks/max-temp/temp-analytics.sh new file mode 100755 index 000000000..8ab2113d8 --- /dev/null +++ b/evaluation/distr_benchmarks/max-temp/temp-analytics.sh @@ -0,0 +1,20 @@ +#!/bin/bash +IN=${IN:-/max-temp/temperatures.txt} + +## Processing +hdfs dfs -cat "${IN}" | + cut -c 89-92 | + grep -v 999 | + sort -rn | + head -n1 > max.txt + +hdfs dfs -cat "${IN}" | + cut -c 89-92 | + grep -v 999 | + sort -n | + head -n1 > min.txt + +hdfs dfs -cat "${IN}" | + cut -c 89-92 | + grep -v 999 | + awk "{ total += \$1; count++ } END { print total/count }" > average.txt From 8124626dd7c743d86d25ec99bcf68d53b904cfb9 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sat, 11 Jun 2022 18:35:08 +0000 Subject: [PATCH 32/37] fix typo --- 
evaluation/distr_benchmarks/max-temp/run.distr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/max-temp/run.distr.sh b/evaluation/distr_benchmarks/max-temp/run.distr.sh index 3114fb3b9..bdfb10943 100755 --- a/evaluation/distr_benchmarks/max-temp/run.distr.sh +++ b/evaluation/distr_benchmarks/max-temp/run.distr.sh @@ -12,7 +12,7 @@ max-temp_bash(){ touch "$times_file" echo executing max temp $(date) | tee -a "$times_file" outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" - echo "max-temp.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" + echo "temp-analytics.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" } max-temp_pash(){ From 732c78c4bfe4b37474be684d338a26d83ed3d019 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sat, 11 Jun 2022 18:45:21 +0000 Subject: [PATCH 33/37] fixes to eval scripts --- .../distr_benchmarks/nlp/input/setup.sh | 29 +++++++++++++++++-- evaluation/distr_benchmarks/nlp/run.distr.sh | 12 ++++---- .../distr_benchmarks/oneliners/run.distr.sh | 4 +-- evaluation/distr_benchmarks/run_all.sh | 5 +++- .../distr_benchmarks/unix50/run.distr.sh | 15 ++++++---- 5 files changed, 49 insertions(+), 16 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 48bdde472..2f2e7462b 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -5,9 +5,15 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} [[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } if [ ! 
-f ./genesis ]; then +<<<<<<< Updated upstream curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis "$PASH_TOP/scripts/append_nl_if_not.sh" genesis fi +======= + curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis + "$PASH_TOP/scripts/append_nl_if_not.sh" genesis + fi +>>>>>>> Stashed changes if [ ! -f ./exodus ]; then curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus @@ -17,7 +23,11 @@ fi if [ ! -e ./pg ]; then mkdir pg cd pg +<<<<<<< Updated upstream if [[ "$1" == "--full" ]]; then +======= + if [[ "$1" == "--gen-full" ]]; then +>>>>>>> Stashed changes echo 'N.b.: download/extraction will take about 10min' wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then @@ -27,14 +37,16 @@ if [ ! -e ./pg ]; then please contact the pash developers pash-devs@googlegroups.com EOF exit 1 - fi - cat pg.tar.xz | tar -xJ + fi + cat pg.tar.xz | tar -xJ + else wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip unzip nlp.zip mv data/* . rm nlp.zip data -rf fi +<<<<<<< Updated upstream for f in *.txt; do "$PASH_TOP/scripts/append_nl_if_not.sh" $f @@ -48,3 +60,16 @@ hdfs dfs -mkdir /nlp hdfs dfs -put exodus /nlp/exodus hdfs dfs -put genesis /nlp/genesis hdfs dfs -put pg /nlp/pg +======= +for f in *.txt; do + "$PASH_TOP/scripts/append_nl_if_not.sh" $f +done + cd .. 
+fi + + # Put files in hdfs + hdfs dfs -mkdir /nlp + hdfs dfs -put exodus /nlp/exodus + hdfs dfs -put genesis /nlp/genesis + hdfs dfs -put pg /nlp/pg +>>>>>>> Stashed changes diff --git a/evaluation/distr_benchmarks/nlp/run.distr.sh b/evaluation/distr_benchmarks/nlp/run.distr.sh index a77c00346..8c0488714 100755 --- a/evaluation/distr_benchmarks/nlp/run.distr.sh +++ b/evaluation/distr_benchmarks/nlp/run.distr.sh @@ -52,9 +52,9 @@ nlp_bash(){ IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" script="${name_script_parsed[1]}" - printf -v pad %30s - padded_script="${name}.sh:${pad}" - padded_script=${padded_script:0:30} + printf -v pad %40s + padded_script="${name}.sh: ${pad}" + padded_script=${padded_script:0:40} outputs_file="${outputs_dir}/${script}.${outputs_suffix}" @@ -85,9 +85,9 @@ nlp_pash(){ IFS=";" read -r -a name_script_parsed <<< "${name_script}" name="${name_script_parsed[0]}" script="${name_script_parsed[1]}" - printf -v pad %30s - padded_script="${name}.sh:${pad}" - padded_script=${padded_script:0:30} + printf -v pad %40s + padded_script="${name}.sh: ${pad}" + padded_script=${padded_script:0:40} outputs_file="${outputs_dir}/${script}.${outputs_suffix}" pash_log="${pash_logs_dir}/${script}.pash.log" diff --git a/evaluation/distr_benchmarks/oneliners/run.distr.sh b/evaluation/distr_benchmarks/oneliners/run.distr.sh index 95adff56b..948a61b48 100755 --- a/evaluation/distr_benchmarks/oneliners/run.distr.sh +++ b/evaluation/distr_benchmarks/oneliners/run.distr.sh @@ -39,7 +39,7 @@ oneliners_bash() { export dict= printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${script}.sh:${pad}" padded_script=${padded_script:0:30} seq_outputs_file="${outputs_dir}/${script}.${seq_outputs_suffix}" @@ -77,7 +77,7 @@ oneliners_pash(){ export dict= printf -v pad %30s - padded_script="${script}${pad}" + padded_script="${script}.sh:${pad}" padded_script=${padded_script:0:30} 
outputs_file="${outputs_dir}/${script}.${outputs_suffix}" diff --git a/evaluation/distr_benchmarks/run_all.sh b/evaluation/distr_benchmarks/run_all.sh index fdd89e2f9..4e5934595 100755 --- a/evaluation/distr_benchmarks/run_all.sh +++ b/evaluation/distr_benchmarks/run_all.sh @@ -11,4 +11,7 @@ cd $PASH_TOP/evaluation/distr_benchmarks/analytics-mts bash run.distr.sh cd $PASH_TOP/evaluation/distr_benchmarks/dependency_untangling -bash run.distr.sh \ No newline at end of file +bash run.distr.sh + +cd $PASH_TOP/evaluation/distr_benchmarks/max-temp +bash run.distr.sh diff --git a/evaluation/distr_benchmarks/unix50/run.distr.sh b/evaluation/distr_benchmarks/unix50/run.distr.sh index c4dd9149d..1e10f8b6c 100755 --- a/evaluation/distr_benchmarks/unix50/run.distr.sh +++ b/evaluation/distr_benchmarks/unix50/run.distr.sh @@ -1,6 +1,11 @@ PASH_FLAGS='--width 8 --r_split' export TIMEFORMAT=%R - +names_scripts=( + 1 2 3 4 5 6 7 8 9 10 + 11 12 13 14 15 16 17 18 19 20 + 21 23 24 25 26 28 29 + 30 31 32 33 34 35 36 + ) if [[ "$1" == "--extended" ]]; then echo "Using extended input" export IN_PRE=/unix50/extended_input @@ -20,7 +25,7 @@ unix50_bash(){ echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" - for number in `seq 36` + for number in ${names_scripts[@]} do script="${number}" @@ -30,7 +35,7 @@ unix50_bash(){ outputs_file="${outputs_dir}/${script}.${outputs_suffix}" - echo "${padded_script}" $({ time ./${script}.sh > "$outputs_file"; } 2>&1) | tee -a "$times_file" + echo "${padded_script}" $({ time ./${script}.sh 2> /dev/null > "$outputs_file"; } 2>&1) | tee -a "$times_file" done } @@ -53,7 +58,7 @@ unix50_pash(){ echo executing Unix50 $(date) | tee "$times_file" echo '' >> "$times_file" - for number in `seq 36` + for number in ${names_scripts[@]} do script="${number}" @@ -66,7 +71,7 @@ unix50_pash(){ single_time_file="${outputs_dir}/${script}.${time_suffix}" echo -n "${padded_script}" | tee -a "$times_file" - { time "$PASH_TOP/pa.sh" $flags --log_file 
"${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}" + { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh 2> /dev/null > "$outputs_file"; } 2> "${single_time_file}" cat "${single_time_file}" | tee -a "$times_file" done } From 55aee240b0fb7713a2c2d0f67edf1ce79d796306 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Sun, 12 Jun 2022 03:12:30 +0000 Subject: [PATCH 34/37] small bug --- evaluation/distr_benchmarks/max-temp/run.distr.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evaluation/distr_benchmarks/max-temp/run.distr.sh b/evaluation/distr_benchmarks/max-temp/run.distr.sh index bdfb10943..7d43a3532 100755 --- a/evaluation/distr_benchmarks/max-temp/run.distr.sh +++ b/evaluation/distr_benchmarks/max-temp/run.distr.sh @@ -10,7 +10,8 @@ max-temp_bash(){ mkdir -p "$outputs_dir" touch "$times_file" - echo executing max temp $(date) | tee -a "$times_file" + cat "$times_file" >> "$times_file".d + echo executing max temp $(date) | tee "$times_file" outputs_file="${outputs_dir}/temp-analytics.${outputs_suffix}" echo "temp-analytics.sh: " $({ time ./temp-analytics.sh > "${outputs_file}"; } 2>&1) | tee -a "$times_file" } From e22dd836ad22aab5707a0e38a4f1a563decbfe3e Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Wed, 15 Jun 2022 02:49:02 +0000 Subject: [PATCH 35/37] fix small issues --- evaluation/distr_benchmarks/nlp/4_3.sh | 1 - evaluation/distr_benchmarks/nlp/8.3_3.sh | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/4_3.sh b/evaluation/distr_benchmarks/nlp/4_3.sh index e817e36b8..3e2d98aef 100755 --- a/evaluation/distr_benchmarks/nlp/4_3.sh +++ b/evaluation/distr_benchmarks/nlp/4_3.sh @@ -7,7 +7,6 @@ IN=${IN:-/nlp/pg/} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/4_3/} ENTRIES=${ENTRIES:-1060} mkdir -p "$OUT" -echo $ENTRIES pure_func() { input=$1 diff --git a/evaluation/distr_benchmarks/nlp/8.3_3.sh 
b/evaluation/distr_benchmarks/nlp/8.3_3.sh index 937522b3f..b0df13b9e 100755 --- a/evaluation/distr_benchmarks/nlp/8.3_3.sh +++ b/evaluation/distr_benchmarks/nlp/8.3_3.sh @@ -2,7 +2,7 @@ # tag: compare_exodus_genesis.sh # set -e -IN=${IN:-/nlp/pg/} +IN=${IN:-/nlp/pg} INPUT2=${INPUT2:-/nlp/exodus} OUT=${OUT:-$PASH_TOP/evaluation/distr_benchmarks/nlp/output/8.3_3/} ENTRIES=${ENTRIES:-1060} @@ -10,9 +10,10 @@ mkdir -p $OUT pure_func() { input=$1 + input2=$2 TEMPDIR=$(mktemp -d) cat > ${TEMPDIR}/${input}1.types - hdfs dfs -cat ${INPUT2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${TEMPDIR}/${input}2.types + hdfs dfs -cat ${input2} | tr -sc '[A-Z][a-z]' '[\012*]' | sort -u > ${TEMPDIR}/${input}2.types sort ${TEMPDIR}/${input}1.types ${TEMPDIR}/${input}2.types ${TEMPDIR}/${input}2.types | uniq -c | head rm -rf ${TEMPDIR} } @@ -20,7 +21,7 @@ export -f pure_func for input in $(hdfs dfs -ls -C ${IN} | head -n ${ENTRIES} | xargs -n 1 -I arg1 basename arg1) do - hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input > ${OUT}/${input}.out + hdfs dfs -cat $IN/$input | tr -c 'A-Za-z' '[\n*]' | grep -v "^\s*$" | sort -u | pure_func $input $INPUT2 > ${OUT}/${input}.out done echo 'done'; From 6fc191d9d53e6ba30f993dc5e4713ff9c74023d9 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 16 Jun 2022 01:43:34 +0000 Subject: [PATCH 36/37] change bigrams to be consistant and add hdfs put annotation --- annotations/hdfs.json | 11 +++++++++++ evaluation/distr_benchmarks/oneliners/bi-grams.sh | 5 +++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/annotations/hdfs.json b/annotations/hdfs.json index 60c18da71..1796fa4fd 100644 --- a/annotations/hdfs.json +++ b/annotations/hdfs.json @@ -13,6 +13,17 @@ "outputs": ["stdout"], "comments": "This represents hdfs dfs -cat . 
Slightly hacky since we only check for -cat" }, + { + "predicate": + { + "operator": "exists", + "operands": ["-put"] + }, + "class": "pure", + "inputs": ["stdin"], + "outputs": ["stdout"], + "comments": "Ideally we would use stdin-hyphen but unfortunatly hdfs put deadlocks on fifo" + }, { "predicate": "default", "class": "side-effectful", diff --git a/evaluation/distr_benchmarks/oneliners/bi-grams.sh b/evaluation/distr_benchmarks/oneliners/bi-grams.sh index a081a05ec..460f565d3 100755 --- a/evaluation/distr_benchmarks/oneliners/bi-grams.sh +++ b/evaluation/distr_benchmarks/oneliners/bi-grams.sh @@ -1,12 +1,13 @@ #!/bin/bash # Find all 2-grams in a piece of text -IN=${IN:-/1G.txt} +IN=${IN:-/oneliners/1G.txt} . bi-gram.aux.sh hdfs dfs -cat $IN | - tr -cs A-Za-z '\n' | + tr -c 'A-Za-z' '[\n*]' | + grep -v "^\s*$" | tr A-Z a-z | bigrams_aux | sort | From 26ce8670248bdcd4bc6a1a03a77fe88379a50a98 Mon Sep 17 00:00:00 2001 From: Tammam Mustafa Date: Thu, 16 Jun 2022 02:17:37 +0000 Subject: [PATCH 37/37] fix leftover merge conflict --- .../distr_benchmarks/nlp/input/setup.sh | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 2f2e7462b..e523d21a8 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -5,15 +5,9 @@ PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} [[ "$1" == "-c" ]] && { rm -rf genesis exodus pg; exit; } if [ ! -f ./genesis ]; then -<<<<<<< Updated upstream curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis "$PASH_TOP/scripts/append_nl_if_not.sh" genesis fi -======= - curl -sf https://www.gutenberg.org/cache/epub/8001/pg8001.txt > genesis - "$PASH_TOP/scripts/append_nl_if_not.sh" genesis - fi ->>>>>>> Stashed changes if [ ! -f ./exodus ]; then curl -sf https://www.gutenberg.org/files/33420/33420-0.txt > exodus @@ -23,11 +17,7 @@ fi if [ ! 
-e ./pg ]; then mkdir pg cd pg -<<<<<<< Updated upstream if [[ "$1" == "--full" ]]; then -======= - if [[ "$1" == "--gen-full" ]]; then ->>>>>>> Stashed changes echo 'N.b.: download/extraction will take about 10min' wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then @@ -46,7 +36,6 @@ if [ ! -e ./pg ]; then mv data/* . rm nlp.zip data -rf fi -<<<<<<< Updated upstream for f in *.txt; do "$PASH_TOP/scripts/append_nl_if_not.sh" $f @@ -60,16 +49,3 @@ hdfs dfs -mkdir /nlp hdfs dfs -put exodus /nlp/exodus hdfs dfs -put genesis /nlp/genesis hdfs dfs -put pg /nlp/pg -======= -for f in *.txt; do - "$PASH_TOP/scripts/append_nl_if_not.sh" $f -done - cd .. -fi - - # Put files in hdfs - hdfs dfs -mkdir /nlp - hdfs dfs -put exodus /nlp/exodus - hdfs dfs -put genesis /nlp/genesis - hdfs dfs -put pg /nlp/pg ->>>>>>> Stashed changes