#!/bin/bash
#
# Copyright (C) 2022 David Valin [email protected]
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
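#
# Pull benchmark result files (mostly csv) from a pbench results web server
# and organize them by test and system for later comparison.
#
# Usage (inferred from the argument handling at the bottom of this script):
#   ./pull_data <results_directory> <results_server>
# where <results_directory> is the run directory of interest under the
# server's results tree and <results_server> is the host serving the pbench
# results over http.
#
# Example (hypothetical names):
#   ./pull_data perf_runs_2022 results.example.com
#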
pbench_results_location=""
retrieve_csv_files()
{
if [[ $1 == "" ]]; then
echo Need to designate a results area.
exit
fi
#
# Grab the top directories
#
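    # Note: the recursive --spider pass below is used to recreate the remote
    # directory layout locally without downloading the file contents; the
    # pushd/ls walking that follows depends on that local tree.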
    wget -r -np --spider -l1 -P . http://${pbench_results_location}/${1}/ 2> /dev/null
    pushd ${pbench_results_location}/${1} > /dev/null
    systems=`ls`
    popd > /dev/null
    top_dir=`pwd`
    #
    # Walk through the systems present.
    #
    for sys in $systems; do
        wget -r -np --spider -l1 -P . http://${pbench_results_location}/${1}/$sys/ 2> /dev/null
        pushd ${pbench_results_location}/${1}/${sys} > /dev/null
        #
        # Process the directories with the test names.
        #
        for tests in `ls`; do
            pushd $tests > /dev/null
            #
            # Determine the proper test and pull the results file. Any post
            # processing of the data will be done here.
            #
if [[ $tests == *"linpack_test"* ]]; then
wget http://${pbench_results_location}/${1}/$sys/$tests/results_linpack.csv 2> /dev/null
wget http://${pbench_results_location}/${1}/$sys/$tests/results.txt 2> /dev/null
fi
if [[ $tests == *"pig_test"* ]]; then
wget http://${pbench_results_location}/${1}/$sys/$tests/results_pig.csv 2> /dev/null
fi
if [[ $tests == *"streams_test"* ]]; then
pushd $top_dir > /dev/null
wget -r -np --spider -l1 -P . http://${pbench_results_location}/${1}/$sys/$tests 2> /dev/null
popd > /dev/null
for sdir in `ls -d results_streams_tuned_*`; do
wget http://${pbench_results_location}/${1}/$sys/$tests/$sdir/streams_results/results_streams.csv 2> /dev/null
done
fi
if [[ $tests == "uperf_tuned"* ]]; then
#
# Because uperf is a mess.
#
wget http://${pbench_results_location}/${1}/$sys/$tests/result.csv 2> /dev/null
wget http://${pbench_results_location}/$sys/$tests/result.txt 2> /dev/null
#
# We need to post process this file.
#
mv result.csv results_uperf.csv
fi
if [[ $tests == *"fio_run"* ]]; then
pushd $top_dir > /dev/null
wget -r -np --spider -l1 -P . http://${pbench_results_location}/${1}/$sys/$tests 2> /dev/null
popd > /dev/null
for sdir in `ls -d fio_bs*`; do
pushd ${sdir} > /dev/null
wget http://${pbench_results_location}/${1}/$sys/$tests/$sdir/result.txt 2> /dev/null
wget http://${pbench_results_location}/${1}/$sys/$tests/$sdir/result.csv 2> /dev/null
mv result.csv result_fio.csv
popd > /dev/null
done
fi
if [[ $tests == *"specjbb"* ]]; then
mkdir -p results/wrapper_results
cd results/wrapper_results
wget http://${pbench_results_location}//${1}/$sys/$tests/results/wrapper_results 2> /dev/null
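                # The wget above saved the server's directory listing as a local
                # file named wrapper_results; grep for folder.gif (the icon the
                # index pages use for subdirectories) to pick out the run
                # subdirectory name.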
                next_dir=`grep folder.gif wrapper_results | cut -d'=' -f 5 | cut -d'"' -f 2 | cut -d'/' -f 1`
                mkdir $next_dir
                cd $next_dir
                wget http://${pbench_results_location}/${1}/$sys/$tests/results/wrapper_results/${next_dir} 2> /dev/null
                dir2=`grep folder.gif ${next_dir} | cut -d'=' -f 5 | cut -d'"' -f 2 | sed "s/\.\///g" | sed "s/\///g"`
                for subdir in $dir2; do
                    mkdir $subdir
                    cd $subdir
                    wget http://${pbench_results_location}/${1}/$sys/$tests/results/wrapper_results/${next_dir}/$subdir/results_specjbb.csv 2> /dev/null
                    cd ..
                done
            fi
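            #
            # hammerdb: probe for each database's subdirectory by fetching its
            # directory listing; only pull the corresponding csv when the
            # listing fetch succeeds.
            #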
if [[ $tests == *"hammerdb"* ]]; then
wget http://${pbench_results_location}/${1}/$sys/$tests/mariadb 2> /dev/null
if [ $? -eq 0 ]; then
wget http://${pbench_results_location}/${1}/$sys/$tests/mariadb/results_hammerdb_maria.csv 2> /dev/null
fi
wget http://${pbench_results_location}/${1}/$sys/$tests/postgres 2> /dev/null
if [ $? -eq 0 ]; then
wget http://${pbench_results_location}/${1}/$sys/$tests/postgres/results_hammerdb_pg.csv 2> /dev/null
fi
wget http://${pbench_results_location}/${1}/$sys/$tests/mssql 2> /dev/null
if [ $? -eq 0 ]; then
wget http://${pbench_results_location}/${1}/$sys/$tests/mssql/results_hammerdb_mssql.csv 2> /dev/null
fi
fi
if [[ $tests == *"auto_hpl"* ]]; then
pushd $top_dir > /dev/null
wget -r -np --spider -l1 http://${pbench_results_location}/${1}/$sys/$tests/ 2> /dev/null
popd > /dev/null
for sdir in `ls -d results_auto_hpl_*`; do
pushd $sdir > /dev/null
#
# Pulls the index.html file.
#
wget http://${pbench_results_location}/${1}/${sys}/${tests}/${sdir} 2> /dev/null
file_check=""
if [ -f "index.html" ]; then
file_check="index.html"
fi
if [ -f "${sdir}" ]; then
file_check=$sdir
fi
if [[ $file_check != "" ]]; then
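                        # Parse the saved directory listing for the csv link,
                        # pull just that file, and give it a results_ name.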
                        file_to_pull=`grep csv $file_check | cut -d'<' -f 7 | cut -d'"' -f 2`
                        if [[ $file_to_pull != "" ]]; then
                            wget http://${pbench_results_location}/${1}/${sys}/${tests}/$sdir/$file_to_pull 2> /dev/null
                            file_to=`echo $file_to_pull | cut -d'-' -f 1,2`
                            mv $file_to_pull results_${file_to}.csv
                        fi
                        rm $file_check
                    fi
                    popd > /dev/null
                done
            fi
if [[ $tests == *"etcd_run"* ]]; then
pushd $top_dir > /dev/null
wget -r -np --spider -l1 -P . http://${pbench_results_location}/${1}/$sys/$tests 2> /dev/null
popd > /dev/null
for sdir in `ls -d fio_bs*`; do
pushd ${sdir} > /dev/null
wget http://${pbench_results_location}/${1}/$sys/$tests/$sdir/result.txt 2> /dev/null
wget http://${pbench_results_location}/${1}/$sys/$tests/$sdir/result.csv 2> /dev/null
mv result.csv result_etcd.csv
popd > /dev/null
done
fi
if [[ $tests == *"speccpu2017"* ]]; then
pushd $top_dir > /dev/null
wget -r -np --spider -l1 -P . http://${pbench_results_location}/${1}/$sys/$tests 2> /dev/null
popd > /dev/null
for sdir in `ls -d results_speccpu*`; do
#
# Pulls the index.html file.
#
mkdir -p $sdir/run_results
pushd $sdir/run_results
wget http://${pbench_results_location}/${1}/${sys}/${tests}/${sdir}/run_results/ 2> /dev/null
file_check=""
if [ -f "index.html" ]; then
file_check="index.html"
fi
if [ -f "${sdir}" ]; then
file_check=$sdir
fi
if [[ $file_check != "" ]]; then
file_to_pull=`grep csv $file_check | cut -d'<' -f 7 | cut -d'"' -f 2`
echo file_to_pull $file_to_pull
if [[ $file_to_pull != "" ]]; then
for get_file in $file_to_pull; do
wget http://${pbench_results_location}/${1}/${sys}/${tests}/$sdir/run_results/$get_file 2> /dev/null
done
fi
rm $file_check
fi
popd
done
fi
popd > /dev/null
done
popd > /dev/null
done
}

#
# Now organize the data based on test, then system class. Also sort them
# (at least for aws).
#
organize_results()
{
    #
    # List of tests we know about.
    #
    test_names="results_uperf.csv results_specjbb results_pig results_streams results_linpack results_hammerdb_maria results_hammerdb_pg results_hammerdb_mssql fio_run results_hpl etcd speccpu"
    #
    # Get all the csv files present.
    #
    find ${pbench_results_location} -print | grep csv > results_files
    #
    # Get the system prefixes. This is generic.
    #
    systems=`cut -d'/' -f 4 results_files | sort -u`
    series_list=""
    separ=""
    #
    # If xlarge is in the system name, it is AWS. We may have to deal with
    # other AWS names later, but worry about it then.
    #
    if [[ $systems == *"xlarge"* ]]; then
        #
        # AWS systems
        #
        sys_type="aws"
        #
        # Build the series list (m5, m5a, i3en...).
        #
        type=`cut -d'/' -f 4 results_files | cut -d'.' -f 1 | sort -u`
        for i in $type; do
            grep ${i}"\." results_files > ${i}_series
            series_list=${series_list}${separ}${i}_series
            separ=" "
        done
    else
        echo "$systems is unknown."
        exit 1
    fi
    rm results_locations 2> /dev/null
    rm results_missing_tests 2> /dev/null
    rm results_found_tests 2> /dev/null
    #
    # Walk through the tests and systems and build the appropriate results file.
    #
    series_ordered=""
    for tn in $test_names; do
        echo "test ${tn}" >> results_locations
        echo "test ${tn}" >> results_found_tests
        for series in $series_list; do
            if [[ $sys_type == "aws" ]]; then
                #
                # Obtain the system names, and sort them based on the xlarge
                # value: 2xlarge, 4xlarge, ...
                #
                series_ordered=`cut -d/ -f 4 $series | sort -u | sed "s/\./ /g" | sort -n -k2 | sed "s/ /./g"`
            fi
            if [[ $sys_type == "azure" ]]; then
                echo "Need to handle azure"
                exit 1
            fi
            #
            # Report the results.
            #
            echo " new_series: $series" >> results_locations
            for sys_checking in $series_ordered; do
                res_file=`grep --no-filename $tn $series_list | grep $sys_checking`
                if [[ $res_file != "" ]]; then
                    echo " ${res_file},${sys_checking}" >> results_locations
                    echo " ${sys_checking}" >> results_found_tests
                else
                    #
                    # Test is missing, report it.
                    #
                    echo "Missing $tn for $sys_checking"
                    echo "Missing $tn for $sys_checking" >> results_missing_tests
                fi
            done
        done
    done
}
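
#
# Main: $1 is the results directory to pull (handed to retrieve_csv_files),
# and $2 is the pbench results server the files are pulled from.
#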
pbench_results_location=$2
retrieve_csv_files $1
organize_results