How it works

BenchmarkDotNet performs the following steps to run your benchmarks:

BenchmarkRunner generates an isolated project for each set of runtime settings and builds it in Release mode.
Next, we take each method/job/params combination and try to measure its performance by launching the benchmark process several times (LaunchCount).
An invocation of the workload method is an operation. A batch of operations is an iteration. If you have an IterationSetup method, it will be invoked before each iteration, but not between operations. We have the following types of iterations:
- Jitting: The overhead/workload methods are invoked to ensure they are JIT-compiled (and on tiered runtimes, to promote them when possible). These iterations are not used for measurements.
- Pilot: The best operation count will be chosen.
- OverheadWarmup, OverheadWorkload: BenchmarkDotNet overhead will be evaluated.
- ActualWarmup: Warmup of the workload method.
- ActualWorkload: Actual measurements.
- Result = ActualWorkload - <MedianOverhead>
After all of the measurements, BenchmarkDotNet creates:
- An instance of the Summary class that contains all information about benchmark runs.
- A set of files that contain the summary in human-readable and machine-readable formats.
- A set of plots.

Pseudocode

If you don't understand our "count terminology", then you might find the following pseudocode useful:

// Generates a project for the benchmark, builds it into an executable and launches that
// executable LaunchCount times, yielding the parsed output of every launch.
IEnumerable<Results> Run(Benchmark benchmark)
{
    var toolchain = benchmark.GetToolchain();
    var exe = toolchain.Build(toolchain.Generate(benchmark));

    // LaunchCount = 1 by default
    for (int launchIndex = 0; launchIndex < LaunchCount; launchIndex++)
    {
        var benchmarkProcess = Process.Start(exe); // calls ActualRun in a separate process
        yield return ParseResults(benchmarkProcess.Output);
    }
}

// Executed inside the launched benchmark process: runs all stages for a single method/job
// and returns the workload measurements with the median overhead subtracted,
// together with the GC stats (collected only when MemoryDiagnoser is enabled).
Result ActualRun(Method method, Job job)
{
    GlobalSetup();
    JittingStage(method); // triggers JIT compilation (and tiering if enabled) before Pilot/Warmup

    int unrollFactor = job.Run.UnrollFactor; // 16 by default

    long perfectInvocationCount = Pilot(method, unrollFactor);

    // Overhead: the same warmup + workload stages, but for an empty method
    Warmup(EMPTY_METHOD, perfectInvocationCount, unrollFactor); // EMPTY_METHOD has same return type and arguments as benchmark
    var overhead = Workload(EMPTY_METHOD, perfectInvocationCount, unrollFactor);

    // Actual: warmup + workload stages for the benchmarked method, with the same invocation count and unroll factor
    Warmup(method, perfectInvocationCount, unrollFactor);
    var result = Workload(method, perfectInvocationCount, unrollFactor);

    // declared outside of the "if" so that it is still in scope for the return statement
    GcStats gcStats = default;
    if (MemoryDiagnoser.IsEnabled)
        gcStats = MeasureGcStats(method, perfectInvocationCount, unrollFactor);

    GlobalCleanup();

    return (result - Median(overhead), gcStats);
}

// Invokes the method once, without unrolling, so that it gets JIT-compiled.
// On a tiered JIT it then runs up to JitInfo.MaxTierPromotions extra iterations of
// JitInfo.TieredCallCountThreshold invocations each, pausing after every one of them.
void JittingStage(Method method)
{
    RunIteration(method, invokeCount: 1, unrollFactor: 1);

    // nothing more to do when the JIT is not tiered
    if (!JitInfo.IsTiered)
        return;

    for (int promotion = 0; promotion < JitInfo.MaxTierPromotions; promotion++)
    {
        RunIteration(method, invokeCount: JitInfo.TieredCallCountThreshold, unrollFactor: 1);
        Thread.Sleep(250); // presumably gives the runtime time to promote the method - TODO confirm
    }
}

// Chooses the invocation count: starting from minInvokeCount, the count is doubled
// until the heuristic accepts the measurement of an iteration run with that count.
long Pilot(Method method, int unrollFactor)
{
    // invokeCount is the equivalent of InnerIterationCount from xunit-performance
    for (long invokeCount = minInvokeCount; ; invokeCount *= 2)
    {
        var pilotMeasurement = RunIteration(method, invokeCount, unrollFactor);

        if (heuristic.IsPilotRequirementMet(pilotMeasurement))
            return invokeCount;
    }
}

// Keeps running iterations until the heuristic reports that the warmup requirement is met.
// The measurements are only fed to the heuristic; nothing is returned to the caller.
void Warmup(Method method, long invokeCount, int unrollFactor)
{
    Measurement lastMeasurement;

    do
    {
        lastMeasurement = RunIteration(method, invokeCount, unrollFactor);
    }
    while (!heuristic.IsWarmupRequirementMet(lastMeasurement));
}

// Runs iterations and yields every measurement that is not an outlier.
// Stops as soon as the heuristic reports that the workload requirement is met.
IEnumerable<Measurement> Workload(Method method, long invokeCount, int unrollFactor)
{
    while (true)
    {
        var measurement = RunIteration(method, invokeCount, unrollFactor);

        if (measurement.IsNotOutlier)
            yield return measurement;

        // checked for every measurement, including the outliers that were not yielded
        if (heuristic.IsWorkloadRequirementMet(measurement))
            yield break;
    }
}

// Runs a single iteration and returns a measurement built from the elapsed time of the invocation loop.
// The loop body is executed (invokeCount / unrollFactor) times and contains unrollFactor calls,
// so every iteration invokes the method invokeCount times (when invokeCount is a multiple of unrollFactor).
Measurement RunIteration(Method method, long invokeCount, long unrollFactor)
{
    IterationSetup();
    MemoryCleanup();

    // the clock is started after the setup and read before the cleanup, so only the loop is timed
    var clock = Clock.Start();

    for (long i = 0; i < invokeCount / unrollFactor; i++)
    {
        // we perform manual loop unrolling!!
        method(); // 1st call
        method(); // 2nd call
        // ... the calls in between are omitted here
        method(); // (unrollFactor - 1)'th call
        method(); // unrollFactor'th call
    }

    var clockSpan = clock.GetElapsed();

    IterationCleanup();
    MemoryCleanup();

    return Measurement(clockSpan);
}

// Runs one extra, untimed iteration with monitoring enabled and returns the GC stats it produced.
GcStats MeasureGcStats(Method method, long invokeCount, long unrollFactor)
{
    // we enable monitoring after workload actual run, for this single iteration which is executed at the end
    // so even if we enable AppDomain monitoring in separate process
    // it does not matter, because we have already obtained the results!
    EnableMonitoring();

    IterationSetup();

    var initialGcStats = GcStats.ReadInitial();

    // we do NOT start any clock here, because the enabled monitoring might have some overhead
    // so we just get the gc stats and ignore the timing
    // it's the last thing the process does before it dies, so the enabled monitoring is not an issue for next benchmarks
    // because each of them is going to be executed in a new process

    for (long i = 0; i < invokeCount / unrollFactor; i++)
    {
        // we perform manual loop unrolling!!
        method(); // 1st call
        method(); // 2nd call
        // ... the calls in between are omitted here
        method(); // (unrollFactor - 1)'th call
        method(); // unrollFactor'th call
    }

    var finalGcStats = GcStats.ReadFinal();

    IterationCleanup();

    return finalGcStats - initialGcStats; // the result is the difference between the stats collected after and before running the extra iteration
}