Skip to content

Commit 9fb769e

Browse files
committed
[DEVOPS-3509] test automation: add options for efficient testing
Summary: run_tests_on_spark.py script has three (related) improvements: * When dynamically generating test list from C++ binaries, just collect them locally without a spark job. This avoids the spark overhead and can be done in a short time (~90 seconds) that would normally be idle waiting for spark resources. * A new ignore_list option that can be used to remove tests from the generated test list. This can be used when re-running a test job that did not complete and the missing tests are unknown. By passing the list of tests already run & passed, the new run can complete the job efficiently. * A new disable_list option that can be used to remove tests from the generated test list, and also report them as skipped. This can be used to provide a list of disabled (aka muted) tests that need not be run. Test Plan: Jenkins: build type: fastdebug Additional jenkins work needed to fully exercise new features. Reviewers: jharveysmith Reviewed By: jharveysmith Subscribers: devops Differential Revision: https://phorge.dev.yugabyte.com/D46553
1 parent 97155bb commit 9fb769e

File tree

5 files changed

+154
-103
lines changed

5 files changed

+154
-103
lines changed

build-support/common-test-env.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1359,7 +1359,7 @@ run_tests_on_spark() {
13591359
time "$spark_submit_cmd_path" \
13601360
--driver-cores "$INITIAL_SPARK_DRIVER_CORES" \
13611361
"$YB_SCRIPT_PATH_RUN_TESTS_ON_SPARK" \
1362-
"${run_tests_args[@]}" "$@" 2>&1 | \
1362+
"${run_tests_args[@]}" 2>&1 | \
13631363
grep -Ev "TaskSetManager: (Starting task|Finished task .* \([0-9]+[1-9]/[0-9]+\))" \
13641364
--line-buffered
13651365
exit "${PIPESTATUS[0]}"

build-support/jenkins/yb-jenkins-test.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,14 @@ if [[ ${YB_COMPILE_ONLY} != "1" ]]; then
188188
run_tests_extra_args+=( "--test_conf" "${test_conf_path}" )
189189
fi
190190

191+
if [[ -n "${SPARK_IGNORE_FILE:-}" && -f "$SPARK_IGNORE_FILE" ]]; then
192+
log "Using ignore list file: $SPARK_IGNORE_FILE"
193+
run_tests_extra_args+=( "--ignore_list" "$SPARK_IGNORE_FILE" )
194+
fi
195+
if [[ -n "${SPARK_DISABLE_FILE:-}" && -f "$SPARK_DISABLE_FILE" ]]; then
196+
log "Using disable list file: $SPARK_DISABLE_FILE"
197+
run_tests_extra_args+=( "--disable_list" "$SPARK_DISABLE_FILE" )
198+
fi
191199
run_tests_extra_args+=( "--send_archive_to_workers" )
192200

193201
# Workers use /private path, which caused mis-match when check is done by yb_dist_tests that

python/yugabyte/csi_report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ def close_item(item: str, time_sec: float, status: str, tags: List[str]) -> str:
276276
if status == 'skipped':
277277
# Do not mark skipped items as needing investigation.
278278
req_data['issue'] = {"issueType": "NOT_ISSUE"}
279-
logging.info("CSI close: " + item)
279+
logging.info(f"CSI close ({status}): {item}")
280280
response = requests.put(csi['url'] + '/item/' + item,
281281
headers=csi['headers'],
282282
data=json.dumps(req_data))

python/yugabyte/run_tests_on_spark.py

Lines changed: 136 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -668,41 +668,17 @@ def initialize_remote_task() -> yb_dist_tests.GlobalTestConfig:
668668
return global_conf
669669

670670

671-
def parallel_list_test_descriptors(rel_test_path: str) -> Tuple[List[str], float]:
671+
def list_test_descriptors(rel_test_path: str) -> List[str]:
672672
"""
673-
This is invoked in parallel to list all individual tests within our C++ test programs. Without
674-
this, listing all gtest tests across 330 test programs might take about 5 minutes on TSAN and 2
675-
minutes in debug.
673+
This function lists all individual tests within a C++ test program.
676674
"""
677-
# TODO: We should change this from a parallel spark-job stage to just do it serially on the
678-
# jenkins worker host.
679-
# Rationale: The time may be longer than when the above comment was written, since we have more
680-
# tests, but it is still a good trade-off.
681-
# To run the tasks on spark workers, the workspace archive has to be copied over and unpacked,
682-
# which takes a couple minutes by itself. That cost will have to be paid anyway when second
683-
# stage comes to run the tests, but some of the nodes are re-assigned to other jobs while stage
684-
# 0 is being completed, and the cost is wasted for those nodes.
685-
# Having nodes shuffled off the job while last few tasks of stage 0 are running means that
686-
# resources are assigned somewhat out of order and require more switching overhead.
687-
# Changing the jobs from 2 stages to a single stage allows much better knowledge of how many
688-
# tasks the job is actually going to run, instead of finding out only after stage 0 is complete.
689-
# This would allow much easier cluster scaling calculations.
690-
# Finally, the submitting node is generally doing nothing but waiting in a spark queue, during
691-
# that waiting time stage 0 could already be done and reduce the resource contention of the
692-
# spark cluster workers.
693-
694-
start_time_sec = time.time()
695-
696-
from yugabyte import yb_dist_tests, command_util
697-
global_conf = initialize_remote_task()
698-
675+
global_conf = yb_dist_tests.get_global_conf()
699676
os.environ['BUILD_ROOT'] = global_conf.build_root
700677
if not os.path.isdir(os.environ['YB_THIRDPARTY_DIR']):
701678
find_or_download_thirdparty_script_path = os.path.join(
702679
global_conf.yb_src_root, 'build-support', 'find_or_download_thirdparty.sh')
703680
subprocess.check_call(find_or_download_thirdparty_script_path)
704681

705-
wait_for_path_to_exist(global_conf.build_root)
706682
list_tests_cmd_line = [
707683
os.path.join(global_conf.build_root, rel_test_path), '--gtest_list_tests']
708684

@@ -748,7 +724,7 @@ def parallel_list_test_descriptors(rel_test_path: str) -> Tuple[List[str], float
748724
else:
749725
current_test = trimmed_line
750726

751-
return test_descriptors, time.time() - start_time_sec
727+
return test_descriptors
752728

753729

754730
def get_username() -> str:
@@ -1015,26 +991,14 @@ def collect_cpp_tests(
1015991
else:
1016992
app_name_details = ['{} test programs'.format(len(all_test_programs))]
1017993

1018-
init_spark_context(app_name_details)
1019-
set_global_conf_for_spark_jobs()
1020-
1021-
# Use fewer "slices" (tasks) than there are test programs, in hope to get some batching.
1022-
num_slices = (len(test_programs) + 1) / 2
1023-
assert spark_context is not None
1024-
test_descriptor_lists_and_times: List[Tuple[List[str], float]] = run_spark_action(
1025-
lambda: spark_context.parallelize( # type: ignore
1026-
test_programs, numSlices=num_slices).map(parallel_list_test_descriptors).collect()
1027-
)
1028-
total_elapsed_time_sec = sum([t[1] for t in test_descriptor_lists_and_times])
994+
test_descriptor_strs = one_shot_test_programs
995+
for test_program in test_programs:
996+
test_descriptor_strs.extend(list_test_descriptors(test_program))
1029997

1030998
elapsed_time_sec = time.time() - start_time_sec
1031-
test_descriptor_strs = one_shot_test_programs + functools.reduce(
1032-
operator.add, [t[0] for t in test_descriptor_lists_and_times], [])
1033999
logging.info(
10341000
f"Collected the list of {len(test_descriptor_strs)} gtest tests in "
1035-
f"{elapsed_time_sec:.2f} sec wallclock time, total time spent on Spark workers: "
1036-
f"{total_elapsed_time_sec:.2f} sec, average time per test program: "
1037-
f"{total_elapsed_time_sec / len(test_programs):.2f} sec")
1001+
f"{elapsed_time_sec:.2f} sec wallclock time")
10381002
for test_descriptor_str in test_descriptor_strs:
10391003
if 'YB_DISABLE_TEST_IN_' in test_descriptor_str:
10401004
raise RuntimeError(
@@ -1128,7 +1092,7 @@ def collect_tests(args: argparse.Namespace) -> List[yb_dist_tests.TestDescriptor
11281092

11291093

11301094
def load_test_list(test_list_path: str) -> List[yb_dist_tests.TestDescriptor]:
1131-
logging.info("Loading the list of tests to run from %s", test_list_path)
1095+
logging.info("Loading test list from %s", test_list_path)
11321096
test_descriptors = []
11331097
with open(test_list_path, 'r') as input_file:
11341098
for line in input_file:
@@ -1171,6 +1135,41 @@ def run_spark_action(action: Any) -> Any:
11711135
return results
11721136

11731137

1138+
def report_skipped_test(test_descriptor: yb_dist_tests.TestDescriptor) -> None:
1139+
suite_var_name = 'YB_CSI_' + test_descriptor.language
1140+
skip_time = time.time()
1141+
os.environ[suite_var_name] = propagated_env_vars[suite_var_name]
1142+
csi_id = csi_report.create_test(test_descriptor, skip_time, 0)
1143+
csi_report.close_item(csi_id, skip_time, 'skipped', ['muted'])
1144+
1145+
1146+
def skip_disabled_tests(test_descriptors: List[yb_dist_tests.TestDescriptor],
1147+
disable_list_path: str) -> List[yb_dist_tests.TestDescriptor]:
1148+
1149+
from yugabyte.test_descriptor import SimpleTestDescriptor
1150+
1151+
logging.info("Loading simple test list from %s", disable_list_path)
1152+
disabled_tests = []
1153+
with open(disable_list_path, 'r') as input_file:
1154+
for line in input_file:
1155+
line = line.strip()
1156+
if line:
1157+
disabled_tests.append(SimpleTestDescriptor.parse(line))
1158+
logging.info("Loaded %d disabled tests from %s", len(disabled_tests), disable_list_path)
1159+
1160+
enabled = []
1161+
for td in test_descriptors:
1162+
simple = SimpleTestDescriptor.parse(td.descriptor_str)
1163+
for disabled in disabled_tests:
1164+
if disabled.matches(simple):
1165+
report_skipped_test(td)
1166+
break
1167+
else:
1168+
enabled.append(td)
1169+
logging.info("Remaining enabled tests: %d", len(enabled))
1170+
return enabled
1171+
1172+
11741173
def main() -> None:
11751174
parser = argparse.ArgumentParser(
11761175
description='Run tests on Spark.')
@@ -1186,6 +1185,17 @@ def main() -> None:
11861185
metavar='TEST_LIST_FILE',
11871186
help='A file with a list of tests to run. Useful when e.g. re-running '
11881187
'failed tests using a file produced with --failed_test_list.')
1188+
parser.add_argument('--ignore_list',
1189+
metavar='TEST_LIST_FILE',
1190+
help='A file with a list of tests to ignore. '
1191+
'These tests will not be run or reported. Useful to avoid '
1192+
're-running passed tests of an incomplete run.')
1193+
parser.add_argument('--disable_list',
1194+
metavar='TEST_LIST_FILE',
1195+
help='A file with a list of tests to skip. '
1196+
'Uses test class/name, and may omit cxx_rel_test_binary. '
1197+
'These tests will not be run but will report as skipped. '
1198+
'Useful to avoid running known flakey/broken tests.')
11891199
parser.add_argument('--build-root', dest='build_root', required=True,
11901200
help='Build root (e.g. ~/code/yugabyte/build/debug-gcc-dynamic-community)')
11911201
parser.add_argument('--max-tests', type=int, dest='max_tests',
@@ -1287,6 +1297,16 @@ def main() -> None:
12871297
fatal_error("File specified by --test_list does not exist or is not a file: '{}'".format(
12881298
test_list_path))
12891299

1300+
ignore_list_path = args.ignore_list
1301+
if ignore_list_path and not os.path.isfile(ignore_list_path):
1302+
fatal_error("File specified by --ignore_list does not exist or is not a file: '{}'".format(
1303+
ignore_list_path))
1304+
1305+
disable_list_path = args.disable_list
1306+
if disable_list_path and not os.path.isfile(disable_list_path):
1307+
fatal_error("File specified by --disable_list does not exist or is not a file: '{}'".format(
1308+
disable_list_path))
1309+
12901310
global_conf = yb_dist_tests.get_global_conf()
12911311
if ('YB_MVN_LOCAL_REPO' not in os.environ and
12921312
args.run_java_tests and
@@ -1323,8 +1343,42 @@ def main() -> None:
13231343
# Start the timer.
13241344
global_start_time = time.time()
13251345

1326-
# This needs to be done before Spark context initialization, which will happen as we try to
1327-
# collect all gtest tests in all C++ test programs.
1346+
propagate_env_vars()
1347+
collect_tests_time_sec: Optional[float] = None
1348+
if test_list_path:
1349+
test_descriptors = load_test_list(test_list_path)
1350+
else:
1351+
collect_tests_start_time_sec = time.time()
1352+
test_descriptors = collect_tests(args)
1353+
collect_tests_time_sec = time.time() - collect_tests_start_time_sec
1354+
1355+
# Includes ignored tests, assuming they already ran/passed.
1356+
initial_num = len(test_descriptors)
1357+
logging.info("Initial number of tests: %d", initial_num)
1358+
num_repetitions = args.num_repetitions
1359+
# For now, we are just dividing test reports into two suites by language.
1360+
# Total number per suite planned also includes ignored & disabled tests.
1361+
num_planned_by_language: Dict[str, int] = defaultdict(int)
1362+
for td in test_descriptors:
1363+
if td.attempt_index == 1:
1364+
num_planned_by_language[td.language] += 1
1365+
1366+
if args.save_report_to_build_dir:
1367+
planned_report_paths = []
1368+
planned_report_paths.append(os.path.join(global_conf.build_root, 'planned_tests.json'))
1369+
planned = []
1370+
for td in test_descriptors:
1371+
planned.append(td.descriptor_str)
1372+
save_json_to_paths('planned tests', planned, planned_report_paths, should_gzip=False)
1373+
1374+
if ignore_list_path:
1375+
ignored_tests = load_test_list(ignore_list_path)
1376+
test_descriptors = [td for td in test_descriptors if td not in ignored_tests]
1377+
logging.info("Ignoring %d tests from ignore list %s", initial_num - len(test_descriptors),
1378+
ignore_list_path)
1379+
1380+
# This needs to be done before Spark context initialization.
1381+
# And before the no-tests-to-run check, so that archive can be pre-built.
13281382
if args.send_archive_to_workers:
13291383
archive_exists = (
13301384
global_conf.archive_for_workers is not None and
@@ -1346,36 +1400,54 @@ def main() -> None:
13461400
else:
13471401
yb_dist_tests.compute_archive_sha256sum()
13481402

1349-
propagate_env_vars()
1350-
collect_tests_time_sec: Optional[float] = None
1351-
if test_list_path:
1352-
test_descriptors = load_test_list(test_list_path)
1353-
else:
1354-
collect_tests_start_time_sec = time.time()
1355-
test_descriptors = collect_tests(args)
1356-
collect_tests_time_sec = time.time() - collect_tests_start_time_sec
1357-
13581403
if not test_descriptors and not args.allow_no_tests:
13591404
logging.info("No tests to run")
13601405
return
13611406

1362-
num_tests = len(test_descriptors)
1407+
# If CSI is enabled, we need suites created before reporting any disabled tests.
1408+
if test_descriptors:
1409+
csi_suites: Dict[str, str] = {}
1410+
propagated_env_vars['YB_CSI_REPS'] = str(num_repetitions)
1411+
logging.info("Propagating env var %s (value: %s) to Spark workers",
1412+
'YB_CSI_REPS', num_repetitions)
1413+
csi_lqid = csi_report.launch_qid()
1414+
propagated_env_vars['YB_CSI_QID'] = csi_lqid
1415+
logging.info("Propagating env var %s (value: %s) to Spark workers",
1416+
'YB_CSI_QID', csi_lqid)
1417+
for suite_name, num_tests in num_planned_by_language.items():
1418+
if args.test_filter_re:
1419+
method = "RegEx"
1420+
elif args.test_list:
1421+
method = "Requested"
1422+
else:
1423+
method = "All"
1424+
(csi_var_name, csi_var_value) = csi_report.create_suite(
1425+
csi_lqid, suite_name, os.getenv('YB_CSI_SUITE', ''), method, num_tests,
1426+
num_repetitions, time.time())
1427+
csi_suites[suite_name] = csi_var_value
1428+
propagated_env_vars[csi_var_name] = csi_var_value
1429+
logging.info("Propagating env var %s (value: %s) to Spark workers",
1430+
csi_var_name, csi_var_value)
1431+
1432+
if disable_list_path:
1433+
test_descriptors = skip_disabled_tests(test_descriptors, disable_list_path)
13631434

1364-
if args.max_tests and num_tests > args.max_tests:
1435+
# Actual number to be run after ignored & disabled.
1436+
spark_test_cnt = len(test_descriptors)
1437+
if args.max_tests and spark_test_cnt > args.max_tests:
13651438
logging.info("Randomly selecting {} tests out of {} possible".format(
1366-
args.max_tests, num_tests))
1439+
args.max_tests, spark_test_cnt))
13671440
random.shuffle(test_descriptors)
13681441
test_descriptors = test_descriptors[:args.max_tests]
1369-
num_tests = len(test_descriptors)
1442+
spark_test_cnt = len(test_descriptors)
13701443

13711444
if args.verbose:
13721445
for test_descriptor in test_descriptors:
13731446
logging.info("Will run test: {}".format(test_descriptor))
13741447

1375-
num_repetitions = args.num_repetitions
1376-
total_num_tests = num_tests * num_repetitions
1448+
total_num_tests = spark_test_cnt * num_repetitions
13771449
logging.info("Running {} tests on Spark, {} times each, for a total of {} tests".format(
1378-
num_tests, num_repetitions, total_num_tests))
1450+
spark_test_cnt, num_repetitions, total_num_tests))
13791451

13801452
if num_repetitions > 1:
13811453
test_descriptors = [
@@ -1384,17 +1456,9 @@ def main() -> None:
13841456
for i in range(1, num_repetitions + 1)
13851457
]
13861458

1387-
if args.save_report_to_build_dir:
1388-
planned_report_paths = []
1389-
planned_report_paths.append(os.path.join(global_conf.build_root, 'planned_tests.json'))
1390-
planned = []
1391-
for td in test_descriptors:
1392-
planned.append(td.descriptor_str)
1393-
save_json_to_paths('planned tests', planned, planned_report_paths, should_gzip=False)
1394-
13951459
app_name_details = ['{} tests total'.format(total_num_tests)]
13961460
if num_repetitions > 1:
1397-
app_name_details += ['{} repetitions of {} tests'.format(num_repetitions, num_tests)]
1461+
app_name_details += ['{} repetitions of {} tests'.format(num_repetitions, spark_test_cnt)]
13981462
init_spark_context(app_name_details)
13991463

14001464
set_global_conf_for_spark_jobs()
@@ -1439,35 +1503,6 @@ def monitor_fail_count(stop_event: threading.Event) -> None:
14391503
total_num_tests, len(test_descriptors))
14401504
test_phase_start_time = time.time()
14411505

1442-
# For now, we are just dividing test reports into two suites by language.
1443-
num_planned_by_language: Dict[str, int] = defaultdict(int)
1444-
for td in test_descriptors:
1445-
if td.attempt_index == 1:
1446-
num_planned_by_language[td.language] += 1
1447-
1448-
csi_suites: Dict[str, str] = {}
1449-
propagated_env_vars['YB_CSI_REPS'] = str(num_repetitions)
1450-
logging.info("Propagating env var %s (value: %s) to Spark workers",
1451-
'YB_CSI_REPS', num_repetitions)
1452-
csi_lqid = csi_report.launch_qid()
1453-
propagated_env_vars['YB_CSI_QID'] = csi_lqid
1454-
logging.info("Propagating env var %s (value: %s) to Spark workers",
1455-
'YB_CSI_QID', csi_lqid)
1456-
for suite_name, num_tests in num_planned_by_language.items():
1457-
if args.test_filter_re:
1458-
method = "RegEx"
1459-
elif args.test_list:
1460-
method = "Requested"
1461-
else:
1462-
method = "All"
1463-
(csi_var_name, csi_var_value) = csi_report.create_suite(
1464-
csi_lqid, suite_name, os.getenv('YB_CSI_SUITE', ''), method, num_tests,
1465-
num_repetitions, test_phase_start_time)
1466-
csi_suites[suite_name] = csi_var_value
1467-
propagated_env_vars[csi_var_name] = csi_var_value
1468-
logging.info("Propagating env var %s (value: %s) to Spark workers",
1469-
csi_var_name, csi_var_value)
1470-
14711506
# Randomize test order to avoid any kind of skew.
14721507
random.shuffle(test_descriptors)
14731508
test_names_rdd = spark_context.parallelize( # type: ignore

python/yugabyte/test_descriptor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,14 @@ def __eq__(self, other: object) -> bool:
249249
self.class_name == other_test_descriptor.class_name and
250250
self.test_name == other_test_descriptor.test_name)
251251

252+
# Check name but not binary
253+
def matches(self, other: object) -> bool:
254+
other_test_descriptor = cast(SimpleTestDescriptor, other)
255+
return (
256+
self.language == other_test_descriptor.language and
257+
self.class_name == other_test_descriptor.class_name and
258+
self.test_name == other_test_descriptor.test_name)
259+
252260
def __hash__(self) -> int:
253261
return hash((
254262
self.language,

0 commit comments

Comments
 (0)