Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Run PR gpu utests/relvals on both CUDA and ROCm GPUs #2418

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
1 change: 1 addition & 0 deletions cleanup-cmssdt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ DIRS="lizard flawfinder invalid-includes cmssw-afs-eos-comparison ubsan_logs ib-
DIRS="${DIRS} check_headers valgrind HLT-Validation ib-static-analysis ib-baseline-tests ib-dqm-tests profiling igprof"
DIRS="${DIRS} iwyu material-budget das_query build-any-ib check-unused-cmsdist-packages class_versions"
DIRS="${DIRS} test-os-alma8 test-os-cs8 test-os-ubi8 test-os-lxplus8 test-os-rhel8 test-os-rocky8 test-os-el8 cms-containers-run-cmssw-test"
DIRS="${DIRS} baseLineComparisonsCUDA baseLineComparisonsROCM"
for dir in ${DIRS}; do
[ -d ${JENKINS_ARTIFACTS}/$dir ] || continue
DIRS_PROCESSED="${DIRS_PROCESSED} ${dir}"
Expand Down
2 changes: 1 addition & 1 deletion pr_testing/run-pr-relvals.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ echo "${MATRIX_ARGS}" | tr ';' '\n' | while IFS= read -r args; do
if [ $(echo "${args}" | sed 's|.*-l ||;s| .*||' | tr ',' '\n' | grep '^all$' | wc -l) -gt 0 ] ; then
OPTS=""
case "${TEST_FLAVOR}" in
gpu ) OPTS="-w gpu" ;;
cuda | rocm ) OPTS="-w gpu" ;;
high_stats ) ;;
nano ) OPTS="-w nano" ;;
* ) ;;
Expand Down
53 changes: 27 additions & 26 deletions pr_testing/run-pr-unittests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,57 +10,58 @@ cd $WORKSPACE/${CMSSW_VERSION}
CMSSW_PKG_COUNT=$(ls -d $LOCALRT/src/*/* | wc -l)
REPORT_OPTS="--report-url ${PR_RESULT_URL} $NO_POST"

rm -f ${RESULTS_DIR}/unittestGPU.txt
mark_commit_status_all_prs 'unittests/gpu' 'pending' -u "${BUILD_URL}" -d "Running tests" || true
rm -f ${RESULTS_DIR}/unittest${GPU_FLAVOR}.txt
mark_commit_status_all_prs "unittests/${GPU_FLAVOR}" 'pending' -u "${BUILD_URL}" -d "Running tests" || true
echo '--------------------------------------'
mkdir -p $WORKSPACE/gpuUnitTests
mkdir -p $WORKSPACE/${GPU_FLAVOR}UnitTests
let UT_TIMEOUT=7200+${CMSSW_PKG_COUNT}*20
UTESTS_CMD="USER_UNIT_TESTS=cuda timeout ${UT_TIMEOUT} scram b -v -k -j ${NCPU} unittests "
gpu_t_lc=$(echo ${GPU_T} | tr '[A-Z]' '[a-z]')
UTESTS_CMD="USER_UNIT_TESTS=${gpu_t_lc} timeout ${UT_TIMEOUT} scram b -v -k -j ${NCPU} unittests "
echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
scram build echo_LD_LIBRARY_PATH || true
scram build -r echo_CXX || true
cms_major=$(echo ${CMSSW_IB} | cut -d_ -f2)
cms_minor=$(echo ${CMSSW_IB} | cut -d_ -f3)
cms_ver="$(echo 00${cms_major} | sed -E 's|^.*(..)$|\1|')$(echo 00${cms_minor} | sed -E 's|^.*(..)$|\1|')"
echo $UTESTS_CMD > $WORKSPACE/gpuUnitTests/log.txt
(eval $UTESTS_CMD && echo 'ALL_OK') > $WORKSPACE/gpuUnitTests/log.txt 2>&1 || true
echo $UTESTS_CMD > $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt
(eval $UTESTS_CMD && echo 'ALL_OK') > $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt 2>&1 || true
echo 'END OF UNIT TESTS'
echo '--------------------------------------'

TEST_ERRORS=$(grep -ai 'had errors\|recipe for target' $WORKSPACE/gpuUnitTests/log.txt | sed "s|'||g;s|.*recipe for target *||;s|.*unittests_|---> test |;s| failed$| timeout|" || true)
TEST_ERRORS=`grep -ai "had errors" $WORKSPACE/gpuUnitTests/log.txt` || true
GENERAL_ERRORS=`grep -a "ALL_OK" $WORKSPACE/gpuUnitTests/log.txt` || true
TEST_ERRORS=$(grep -ai 'had errors\|recipe for target' $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt | sed "s|'||g;s|.*recipe for target *||;s|.*unittests_|---> test |;s| failed$| timeout|" || true)
TEST_ERRORS=`grep -ai "had errors" $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt` || true
GENERAL_ERRORS=`grep -a "ALL_OK" $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt` || true

if [ "X$TEST_ERRORS" != "X" -o "X$GENERAL_ERRORS" = "X" ]; then
echo "Errors in the gpu unit tests"
echo 'GPU_UNIT_TEST_RESULTS;ERROR,GPU Unit Tests,See Log,gpuUnitTests' >> ${RESULTS_DIR}/unittestGPU.txt
echo "Errors in the ${GPU_FLAVOR} unit tests"
echo "${GPU_FLAVOR}_UNIT_TEST_RESULTS;ERROR,GPU Unit Tests,See Log,${GPU_FLAVOR}UnitTests" >> ${RESULTS_DIR}/unittest${GPU_FLAVOR}.txt
ALL_OK=false
UNIT_TESTS_OK=false
$CMS_BOT_DIR/report-pull-request-results PARSE_GPU_UNIT_TESTS_FAIL -f $WORKSPACE/gpuUnitTests/log.txt --report-file ${RESULTS_DIR}/14-unittestGPU-report.res ${REPORT_OPTS}
echo "GpuUnitTests" > ${RESULTS_DIR}/14-failed.res
$CMS_BOT_DIR/report-pull-request-results PARSE_${GPU_FLAVOR}_UNIT_TESTS_FAIL -f $WORKSPACE/${GPU_FLAVOR}UnitTests/log.txt --report-file ${RESULTS_DIR}/14-unittest${GPU_FLAVOR}-report.res ${REPORT_OPTS}
echo "${GPU_FLAVOR}UnitTests" > ${RESULTS_DIR}/14-failed.res
iarspider marked this conversation as resolved.
Show resolved Hide resolved
else
echo 'GPU_UNIT_TEST_RESULTS;OK,GPU Unit Tests,See Log,gpuUnitTests' >> ${RESULTS_DIR}/unittestGPU.txt
echo "${GPU_FLAVOR}_UNIT_TEST_RESULTS;OK,GPU Unit Tests,See Log,${GPU_FLAVOR}UnitTests" >> ${RESULTS_DIR}/unittest${GPU_FLAVOR}.txt
fi
echo "<html><head></head><body>" > $WORKSPACE/gpuUnitTests/success.html
cp $WORKSPACE/gpuUnitTests/success.html $WORKSPACE/gpuUnitTests/failed.html
echo "<html><head></head><body>" > $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html
cp $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html
UT_ERR=false
utlog="testing.log"
for t in $(find $WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/src -name ${utlog} -type f | sed "s|$WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/||;s|/${utlog}$||") ; do
mkdir -p $WORKSPACE/gpuUnitTests/${t}
mv $WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/${t}/${utlog} $WORKSPACE/gpuUnitTests/${t}/
if [ $(grep -a '^\-\-\-> test *[^ ]* *succeeded$' $WORKSPACE/gpuUnitTests/${t}/${utlog} | wc -l) -gt 0 ] ; then
echo "<a href='${t}/${utlog}'>${t}</a><br/>" >> $WORKSPACE/gpuUnitTests/success.html
mkdir -p $WORKSPACE/${GPU_FLAVOR}UnitTests/${t}
mv $WORKSPACE/$CMSSW_IB/tmp/${SCRAM_ARCH}/${t}/${utlog} $WORKSPACE/${GPU_FLAVOR}UnitTests/${t}/
if [ $(grep -a '^\-\-\-> test *[^ ]* *succeeded$' $WORKSPACE/${GPU_FLAVOR}UnitTests/${t}/${utlog} | wc -l) -gt 0 ] ; then
echo "<a href='${t}/${utlog}'>${t}</a><br/>" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html
else
echo "<a href='${t}/${utlog}'>${t}</a><br/>" >> $WORKSPACE/gpuUnitTests/failed.html
echo "<a href='${t}/${utlog}'>${t}</a><br/>" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html
UT_ERR=true
fi
done
if ! $UT_ERR ; then echo "No unit test failed" >> $WORKSPACE/gpuUnitTests/failed.html ; fi
echo "</body></html>" >> $WORKSPACE/gpuUnitTests/success.html
echo "</body></html>" >> $WORKSPACE/gpuUnitTests/failed.html
if ! $UT_ERR ; then echo "No unit test failed" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html ; fi
echo "</body></html>" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/success.html
echo "</body></html>" >> $WORKSPACE/${GPU_FLAVOR}UnitTests/failed.html
prepare_upload_results
if $UNIT_TESTS_OK ; then
mark_commit_status_all_prs 'unittests/gpu' 'success' -u "${BUILD_URL}" -d "Passed"
mark_commit_status_all_prs "unittests/${GPU_FLAVOR}" 'success' -u "${BUILD_URL}" -d "Passed"
else
mark_commit_status_all_prs 'unittests/gpu' 'error' -u "${BUILD_URL}" -d "Some unit tests were failed."
mark_commit_status_all_prs "unittests/${GPU_FLAVOR}" 'error' -u "${BUILD_URL}" -d "Some unit tests were failed."
fi
29 changes: 26 additions & 3 deletions pr_testing/test_multiple_prs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ if [ $(echo "${CONFIG_LINE}" | grep "PROD_ARCH=1" | wc -l) -gt 0 ] ; then
fi
fi
fi
ALL_GPU_TYPES=("cuda" "rocm")

# ----------
# -- MAIN --
Expand Down Expand Up @@ -379,6 +380,16 @@ if $DO_COMPARISON ; then
grep -v '^\(WORKFLOWS\|MATRIX_ARGS\)=' run-baseline-${BUILD_ID}-01.${ex_type_lc} > run-baseline-${BUILD_ID}-02.${ex_type_lc}
echo "WORKFLOWS=-l ${WF_LIST}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc}
echo "MATRIX_ARGS=${WF_ARGS}" >> run-baseline-${BUILD_ID}-02.${ex_type_lc}
if [ X"${ex_type_lc}" = X"gpu" ]; then
for GPU_T in ${ALL_GPU_TYPES[@]}; do
cp run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-01.${GPU_T}
sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-01.${GPU_T}

cp run-baseline-${BUILD_ID}-02.${ex_type_lc} run-baseline-${BUILD_ID}-02.${GPU_T}
sed -i -e "s/TEST_FLAVOR=gpu/TEST_FLAVOR=${GPU_T}/g" run-baseline-${BUILD_ID}-02.${GPU_T}
done
rm run-baseline-${BUILD_ID}-01.${ex_type_lc} run-baseline-${BUILD_ID}-02.${ex_type_lc}
fi
done
popd
send_jenkins_artifacts $WORKSPACE/ib-baseline-tests/ ib-baseline-tests/
Expand Down Expand Up @@ -1314,7 +1325,9 @@ if [ "X$BUILD_OK" = Xtrue -a "$RUN_TESTS" = "true" ]; then
fi
if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^GPU$' | wc -l) -gt 0 -a X"${DISABLE_GPU_TESTS}" != X"true" ] ; then
DO_GPU_TESTS=true
mark_commit_status_all_prs 'unittests/gpu' 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start"
for GPU_T in ${ALL_GPU_TYPES[@]} ; do
mark_commit_status_all_prs "unittests/${GPU_T}" 'pending' -u "${BUILD_URL}" -d "Waiting for tests to start"
done
fi
if [ $(echo ${ENABLE_BOT_TESTS} | tr ',' ' ' | tr ' ' '\n' | grep '^HLT_P2_TIMING$' | wc -l) -gt 0 ] ; then
if [ $(echo ${ARCHITECTURE} | grep "_amd64_" | wc -l) -gt 0 ] ; then
Expand Down Expand Up @@ -1456,6 +1469,12 @@ if [ "X$DO_SHORT_MATRIX" = Xtrue ]; then
ex_type_lc=$(echo ${ex_type} | tr '[A-Z]' '[a-z]')
grep -v '^MATRIX_ARGS=' $WORKSPACE/run-relvals.prop > $WORKSPACE/run-relvals-${ex_type_lc}.prop
echo "MATRIX_ARGS=$(get_pr_relval_args $DO_COMPARISON _${ex_type})" >> $WORKSPACE/run-relvals-${ex_type_lc}.prop
if [ "${ex_type_lc}" = "gpu" ]; then
for GPU_T in ${ALL_GPU_TYPES[@]}; do
cp $WORKSPACE/run-relvals-${ex_type_lc}.prop $WORKSPACE/run-relvals-${GPU_T}.prop
done
rm $WORKSPACE/run-relvals-${ex_type_lc}.prop
fi
done
if [ $(runTheMatrix.py --help | grep '^ *--maxSteps' | wc -l) -eq 0 ] ; then
mark_commit_status_all_prs "relvals/input" 'success' -u "${BUILD_URL}" -d "Not ran, runTheMatrix does not support --maxSteps flag" -e
Expand Down Expand Up @@ -1487,7 +1506,10 @@ if [ "X$DO_ADDON_TESTS" = Xtrue ]; then
fi

if [ "X$DO_GPU_TESTS" = Xtrue ]; then
cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests.prop
for GPU_T in ${ALL_GPU_TYPES[@]}; do
cp $WORKSPACE/test-env.txt $WORKSPACE/run-unittests-${GPU_T}.prop
echo "GPU_FLAVOR=${GPU_T}" >> $WORKSPACE/run-unittests-${GPU_T}.prop
done
fi

if ${BUILD_EXTERNAL} ; then
Expand All @@ -1498,7 +1520,7 @@ fi

if [ "${DO_PROFILING}" = "true" ] ; then
PROFILING_WORKFLOWS=$($CMS_BOT_DIR/cmssw-pr-test-config _PROFILING | tr ',' ' ')
for wf in ${PROFILING_WORKFLOWS};do
for wf in ${PROFILING_WORKFLOWS}; do
cp $WORKSPACE/test-env.txt $WORKSPACE/run-profiling-$wf.prop
echo "PROFILING_WORKFLOWS=${wf}" >> $WORKSPACE/run-profiling-$wf.prop
done
Expand All @@ -1513,3 +1535,4 @@ if [ "${DO_HLT_P2_INTEGRATION}" = "true" ] ; then
fi

rm -f $WORKSPACE/test-env.txt

12 changes: 9 additions & 3 deletions report-pull-request-results.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ def read_unit_tests_file(unit_tests_file):
send_message_pr(message)


def read_gpu_tests_file(unit_tests_file):
def read_gpu_tests_file(unit_tests_file, gpu_flavor="GPU"):
errors_found = ""
err_cnt = 0
for line in openlog(unit_tests_file):
Expand All @@ -423,8 +423,8 @@ def read_gpu_tests_file(unit_tests_file):
continue
errors_found += line
message = (
"\n## GPU Unit Tests\n\nI found %s errors in the following unit tests:\n\n<pre>%s</pre>"
% (err_cnt, errors_found)
"\n## %s Unit Tests\n\nI found %s errors in the following unit tests:\n\n<pre>%s</pre>"
% (gpu_flavor, err_cnt, errors_found)
)
send_message_pr(message)

Expand Down Expand Up @@ -606,6 +606,8 @@ def complain_missing_param(param_name):
GITLOG_FILE_BASE_URL = "%s/git-recent-commits.json" % options.report_url
GIT_CMS_MERGE_TOPIC_BASE_URL = "%s/git-merge-result" % options.report_url

ACTION = ACTION.upper()

if ACTION == "GET_BASE_MESSAGE":
get_base_message()
elif ACTION == "PARSE_UNIT_TESTS_FAIL":
Expand All @@ -630,6 +632,10 @@ def complain_missing_param(param_name):
read_material_budget_log_file(options.unit_tests_file)
elif ACTION == "MERGE_COMMITS":
add_to_report(get_recent_merges_message())
elif ACTION == "PARSE_CUDA_UNIT_TESTS_FAIL":
read_gpu_tests_file(options.unit_tests_file, "CUDA")
elif ACTION == "PARSE_ROCM_UNIT_TESTS_FAIL":
read_gpu_tests_file(options.unit_tests_file, "ROCm")
elif ACTION == "PARSE_GPU_UNIT_TESTS_FAIL":
read_gpu_tests_file(options.unit_tests_file)
else:
Expand Down
4 changes: 2 additions & 2 deletions run-ib-pr-matrix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ if [ "${CHECK_WORKFLOWS}" = "true" ] ; then
send_jenkins_artifacts ${WORKSPACE}/workflows-${BUILD_ID}.log ${ARTIFACT_DIR}/workflows-${BUILD_ID}.log
OPTS=""
case "${TEST_FLAVOR}" in
gpu ) OPTS="-w gpu" ;;
cuda | rocm ) OPTS="-w gpu" ;;
high_stats ) ;;
nano ) OPTS="-w nano" ;;
* ) ;;
Expand Down Expand Up @@ -49,7 +49,7 @@ pushd "$WORKSPACE/matrix-results"
CMD_OPTS=""
if ${PRODUCTION_RELEASE} && cmsDriver.py --help | grep -q '\-\-maxmem_profile' ; then CMD_OPTS="--maxmem_profile" ; fi
case "${TEST_FLAVOR}" in
gpu ) MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" ;;
cuda | rocm ) MATRIX_ARGS="-w gpu ${MATRIX_ARGS}" ;;
high_stats ) CMD_OPTS="-n 500" ; MATRIX_ARGS="-i all ${MATRIX_ARGS}" ;;
threading ) MATRIX_ARGS="-i all -t 4 ${MATRIX_ARGS}" ; let NJOBS=(${NJOBS}/4)+1 ;;
nano ) MATRIX_ARGS="-w nano -i all ${MATRIX_ARGS}" ;;
Expand Down