project/README.md
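
The snippet below reads per-test timing rows from a directory of CSV files, totals execution time per suite, and prints the 50 slowest suites in minutes. In the `pyspark` shell `spark` already exists; the `getOrCreate()` call shown makes the script runnable standalone as well.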
```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum  # alias avoids shadowing the builtin sum
from pyspark.sql.types import StructType, StructField, StringType, LongType

# Reuses the shell's session if one exists, otherwise creates one.
spark = SparkSession.builder.getOrCreate()

# One row per test execution.
schema = StructType([
    StructField("test_suite", StringType(), True),
    StructField("test_name", StringType(), True),
    StructField("execution_time_ms", LongType(), True),
    StructField("result", StringType(), True),
])

csv_dir = "..."

(spark.read.csv(csv_dir, schema=schema)
    # Drop rows flagged with the -1 sentinel (no recorded execution time).
    .filter(col("execution_time_ms") != -1)
    .groupBy("test_suite")
    # Total per-suite time, converted from milliseconds to minutes.
    .agg((spark_sum("execution_time_ms") / 60000).alias("execution_time_mins"))
    .orderBy(col("execution_time_mins").desc())
    .limit(50)
    .select("test_suite", "execution_time_mins")
    .show(50, truncate=False))
```
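
`limit(50)` caps the result at the 50 slowest suites, and the matching `show(50, truncate=False)` prints all of them in full (`show` otherwise defaults to 20 rows and truncates long values).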