From f16786ba7971b3599c150f6ad867ec4b80e9b392 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20M=C3=BCller?= <schnitzeltony@gmail.com>
Date: Tue, 1 Mar 2022 22:53:18 +0100
Subject: [PATCH] mixxx: upgrade 2.3.1 -> 2.3.2 to fix build with ffmpeg5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Andreas Müller <schnitzeltony@gmail.com>
---
 recipes-musicians/mixxx/mixxx.bb              |   11 +-
 ...1-Update-Benchmark-library-to-v1.6.0.patch | 5933 -----------------
 2 files changed, 4 insertions(+), 5940 deletions(-)
 delete mode 100644 recipes-musicians/mixxx/mixxx/0001-Update-Benchmark-library-to-v1.6.0.patch

diff --git a/recipes-musicians/mixxx/mixxx.bb b/recipes-musicians/mixxx/mixxx.bb
index c17ddee..c7cc8d5 100644
--- a/recipes-musicians/mixxx/mixxx.bb
+++ b/recipes-musicians/mixxx/mixxx.bb
@@ -1,7 +1,7 @@
 SUMMARY = "Qt based DJ software"
 HOMEPAGE = "http://mixxx.org/"
 LICENSE = "GPLv2+"
-LIC_FILES_CHKSUM = "file://LICENSE;md5=e5323335634095f8bdd15f6a5c5c5865"
+LIC_FILES_CHKSUM = "file://LICENSE;md5=b3ce5d18079fa79804cd62469a51d176"
 
 inherit cmake_qt5 gtk-icon-cache features_check
 
@@ -44,13 +44,10 @@ DEPENDS += " \
 # causes segfault trying to find debug libs
 #    gperftools
 
-SRC_URI = " \
-    git://github.com/mixxxdj/${BPN}.git;branch=main;protocol=https \
-    file://0001-Update-Benchmark-library-to-v1.6.0.patch \
-"
-SRCREV = "8acb633220024222504cddcd1f5ea26e659fbcc7"
+SRC_URI = "git://github.com/mixxxdj/${BPN}.git;branch=main;protocol=https"
+SRCREV = "96fc5dd217a81d0e2327a52f564f7aea7d5c2c43"
 S = "${WORKDIR}/git"
-PV = "2.3.1"
+PV = "2.3.2"
 
 EXTRA_OECMAKE += " \
     -DSHOUTCAST=OFF \
diff --git a/recipes-musicians/mixxx/mixxx/0001-Update-Benchmark-library-to-v1.6.0.patch b/recipes-musicians/mixxx/mixxx/0001-Update-Benchmark-library-to-v1.6.0.patch
deleted file mode 100644
index 52a46fc..0000000
--- a/recipes-musicians/mixxx/mixxx/0001-Update-Benchmark-library-to-v1.6.0.patch
+++ /dev/null
@@ -1,5933 +0,0 @@
-From e590711ca2a0882e331162d14405e25c49f7b774 Mon Sep 17 00:00:00 2001
-From: Uwe Klotz <uklotz@mixxx.org>
-Date: Tue, 23 Nov 2021 23:36:21 +0100
-Subject: [PATCH] Update Benchmark library to v1.6.0
-
-
-Upstream-Status: Accepted [https://github.com/mixxxdj/mixxx/commit/e590711ca2a0882e331162d14405e25c49f7b774]
----
- lib/benchmark/AUTHORS                        |   4 +
- lib/benchmark/CMakeLists.txt                 |  50 +-
- lib/benchmark/COMMIT                         |   2 +-
- lib/benchmark/CONTRIBUTING.md                |  58 ++
- lib/benchmark/CONTRIBUTORS                   |   7 +
- lib/benchmark/README.md                      | 216 +++++
- lib/benchmark/cmake/AddCXXCompilerFlag.cmake |  12 +-
- lib/benchmark/cmake/CXXFeatureCheck.cmake    |   5 +
- lib/benchmark/cmake/GetGitVersion.cmake      |  22 +-
- lib/benchmark/cmake/GoogleTest.cmake         |   3 +
- lib/benchmark/cmake/GoogleTest.cmake.in      |   2 +-
- lib/benchmark/cmake/Modules/FindPFM.cmake    |  26 +
- lib/benchmark/cmake/benchmark.pc.in          |   4 +-
- lib/benchmark/include/benchmark/benchmark.h  | 250 ++++--
- lib/benchmark/src/CMakeLists.txt             |  14 +-
- lib/benchmark/src/benchmark.cc               | 258 ++++--
- lib/benchmark/src/benchmark_api_internal.cc  |  93 ++-
- lib/benchmark/src/benchmark_api_internal.h   |  71 +-
- lib/benchmark/src/benchmark_register.cc      | 209 +++--
- lib/benchmark/src/benchmark_register.h       |  23 +-
- lib/benchmark/src/benchmark_runner.cc        | 401 +++++-----
- lib/benchmark/src/benchmark_runner.h         |  73 +-
- lib/benchmark/src/check.h                    |  39 +-
- lib/benchmark/src/colorprint.cc              |   4 +-
- lib/benchmark/src/commandlineflags.cc        |  60 +-
- lib/benchmark/src/commandlineflags.h         |  73 +-
- lib/benchmark/src/complexity.cc              |  19 +-
- lib/benchmark/src/console_reporter.cc        |  21 +-
- lib/benchmark/src/csv_reporter.cc            |   3 +-
- lib/benchmark/src/cycleclock.h               |  67 +-
- lib/benchmark/src/internal_macros.h          |  10 +-
- lib/benchmark/src/json_reporter.cc           | 100 ++-
- lib/benchmark/src/log.h                      |   2 +-
- lib/benchmark/src/mutex.h                    |  44 +-
- lib/benchmark/src/perf_counters.cc           | 132 +++
- lib/benchmark/src/perf_counters.h            | 172 ++++
- lib/benchmark/src/re.h                       |   2 +-
- lib/benchmark/src/reporter.cc                |  15 +-
- lib/benchmark/src/sleep.cc                   |  16 +
- lib/benchmark/src/statistics.cc              |  36 +-
- lib/benchmark/src/statistics.h               |   1 +
- lib/benchmark/src/string_util.cc             |  16 +
- lib/benchmark/src/string_util.h              |   2 +
- lib/benchmark/src/sysinfo.cc                 |  58 +-
- lib/benchmark/src/thread_timer.h             |   8 +-
- lib/benchmark/src/timers.cc                  |  80 +-
- lib/benchmark/tools/compare.py               |  37 +-
- lib/benchmark/tools/gbench/report.py         | 799 +++++++++++++++----
- lib/benchmark/tools/gbench/util.py           |  23 +-
- 49 files changed, 2754 insertions(+), 888 deletions(-)
- create mode 100644 lib/benchmark/CONTRIBUTING.md
- create mode 100644 lib/benchmark/README.md
- create mode 100644 lib/benchmark/cmake/Modules/FindPFM.cmake
- create mode 100644 lib/benchmark/src/perf_counters.cc
- create mode 100644 lib/benchmark/src/perf_counters.h
-
-diff --git a/lib/benchmark/AUTHORS b/lib/benchmark/AUTHORS
-index 89205a1adb..838dd4f5bd 100644
---- a/lib/benchmark/AUTHORS
-+++ b/lib/benchmark/AUTHORS
-@@ -13,6 +13,7 @@ Alex Steele <steeleal123@gmail.com>
- Andriy Berestovskyy <berestovskyy@gmail.com>
- Arne Beer <arne@twobeer.de>
- Carto
-+Christian Wassermann <christian_wassermann@web.de>
- Christopher Seymour <chris.j.seymour@hotmail.com>
- Colin Braley <braley.colin@gmail.com>
- Daniel Harvey <danielharvey458@gmail.com>
-@@ -42,6 +43,7 @@ Matt Clarkson <mattyclarkson@gmail.com>
- Maxim Vafin <maxvafin@gmail.com>
- MongoDB Inc.
- Nick Hutchinson <nshutchinson@gmail.com>
-+Norman Heino <norman.heino@gmail.com>
- Oleksandr Sochka <sasha.sochka@gmail.com>
- Ori Livneh <ori.livneh@gmail.com>
- Paul Redmond <paul.redmond@gmail.com>
-@@ -51,6 +53,8 @@ Sayan Bhattacharjee <aero.sayan@gmail.com>
- Shuo Chen <chenshuo@chenshuo.com>
- Steinar H. Gunderson <sgunderson@bigfoot.com>
- Stripe, Inc.
-+Tobias Schmidt <tobias.schmidt@in.tum.de>
- Yixuan Qiu <yixuanq@gmail.com>
- Yusuke Suzuki <utatane.tea@gmail.com>
- Zbigniew Skowron <zbychs@gmail.com>
-+Min-Yih Hsu <yihshyng223@gmail.com>
-diff --git a/lib/benchmark/CMakeLists.txt b/lib/benchmark/CMakeLists.txt
-index 67c0b70015..49f2ae2a0f 100644
---- a/lib/benchmark/CMakeLists.txt
-+++ b/lib/benchmark/CMakeLists.txt
-@@ -13,7 +13,7 @@ foreach(p
-   endif()
- endforeach()
- 
--project (benchmark CXX)
-+project (benchmark VERSION 1.6.0 LANGUAGES CXX)
- 
- option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
- option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
-@@ -34,7 +34,20 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi
- # in cases where it is not possible to build or find a valid version of gtest.
- option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON)
- 
-+option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF)
-+
- set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-+if(MSVC)
-+    # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and
-+    # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the
-+    # undocumented, but working variable.
-+    # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170
-+    set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID})
-+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM")
-+      set(CMAKE_CROSSCOMPILING TRUE)
-+    endif()
-+endif()
-+
- set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF)
- function(should_enable_assembly_tests)
-   if(CMAKE_BUILD_TYPE)
-@@ -81,8 +94,14 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
- include(GetGitVersion)
- get_git_version(GIT_VERSION)
- 
-+# If no git version can be determined, use the version
-+# from the project() command
-+if ("${GIT_VERSION}" STREQUAL "0.0.0")
-+  set(VERSION "${benchmark_VERSION}")
-+else()
-+  set(VERSION "${GIT_VERSION}")
-+endif()
- # Tell the user what versions we are using
--string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION})
- message(STATUS "Version: ${VERSION}")
- 
- # The version of the libraries
-@@ -144,9 +163,12 @@ else()
-   add_cxx_compiler_flag(-Werror RELEASE)
-   add_cxx_compiler_flag(-Werror RELWITHDEBINFO)
-   add_cxx_compiler_flag(-Werror MINSIZEREL)
--  # Disabled until googletest (gmock) stops emitting variadic macro warnings
--  #add_cxx_compiler_flag(-pedantic)
--  #add_cxx_compiler_flag(-pedantic-errors)
-+  if (NOT BENCHMARK_ENABLE_TESTING)
-+    # Disable warning when compiling tests as gtest does not use 'override'.
-+    add_cxx_compiler_flag(-Wsuggest-override)
-+  endif()
-+  add_cxx_compiler_flag(-pedantic)
-+  add_cxx_compiler_flag(-pedantic-errors)
-   add_cxx_compiler_flag(-Wshorten-64-to-32)
-   add_cxx_compiler_flag(-fstrict-aliasing)
-   # Disable warnings regarding deprecated parts of the library while building
-@@ -194,6 +216,7 @@ else()
-   # Link time optimisation
-   if (BENCHMARK_ENABLE_LTO)
-     add_cxx_compiler_flag(-flto)
-+    add_cxx_compiler_flag(-Wno-lto-type-mismatch)
-     if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-       find_program(GCC_AR gcc-ar)
-       if (GCC_AR)
-@@ -245,11 +268,17 @@ if (BENCHMARK_USE_LIBCXX)
-   endif()
- endif(BENCHMARK_USE_LIBCXX)
- 
-+set(EXTRA_CXX_FLAGS "")
-+if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-+  # Clang on Windows fails to compile the regex feature check under C++11
-+  set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14")
-+endif()
-+
- # C++ feature checks
- # Determine the correct regular expression engine to use
--cxx_feature_check(STD_REGEX)
--cxx_feature_check(GNU_POSIX_REGEX)
--cxx_feature_check(POSIX_REGEX)
-+cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS})
-+cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS})
-+cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS})
- if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
-   message(FATAL_ERROR "Failed to determine the source files for the regular expression backend")
- endif()
-@@ -257,11 +286,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX
-         AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX)
-   message(WARNING "Using std::regex with exceptions disabled is not fully supported")
- endif()
-+
- cxx_feature_check(STEADY_CLOCK)
- # Ensure we have pthreads
- set(THREADS_PREFER_PTHREAD_FLAG ON)
- find_package(Threads REQUIRED)
- 
-+if (BENCHMARK_ENABLE_LIBPFM)
-+  find_package(PFM)
-+endif()
-+
- # Set up directories
- include_directories(${PROJECT_SOURCE_DIR}/include)
- 
-diff --git a/lib/benchmark/COMMIT b/lib/benchmark/COMMIT
-index 7ca3e2e4ca..2d75860141 100644
---- a/lib/benchmark/COMMIT
-+++ b/lib/benchmark/COMMIT
-@@ -1 +1 @@
--e30cac6b06cf05416a9358df8be868ab01602a68
-+f91b6b42b1b9854772a90ae9501464a161707d1e
-diff --git a/lib/benchmark/CONTRIBUTING.md b/lib/benchmark/CONTRIBUTING.md
-new file mode 100644
-index 0000000000..43de4c9d47
---- /dev/null
-+++ b/lib/benchmark/CONTRIBUTING.md
-@@ -0,0 +1,58 @@
-+# How to contribute #
-+
-+We'd love to accept your patches and contributions to this project.  There are
-+a just a few small guidelines you need to follow.
-+
-+
-+## Contributor License Agreement ##
-+
-+Contributions to any Google project must be accompanied by a Contributor
-+License Agreement.  This is not a copyright **assignment**, it simply gives
-+Google permission to use and redistribute your contributions as part of the
-+project.
-+
-+  * If you are an individual writing original source code and you're sure you
-+    own the intellectual property, then you'll need to sign an [individual
-+    CLA][].
-+
-+  * If you work for a company that wants to allow you to contribute your work,
-+    then you'll need to sign a [corporate CLA][].
-+
-+You generally only need to submit a CLA once, so if you've already submitted
-+one (even if it was for a different project), you probably don't need to do it
-+again.
-+
-+[individual CLA]: https://developers.google.com/open-source/cla/individual
-+[corporate CLA]: https://developers.google.com/open-source/cla/corporate
-+
-+Once your CLA is submitted (or if you already submitted one for
-+another Google project), make a commit adding yourself to the
-+[AUTHORS][] and [CONTRIBUTORS][] files. This commit can be part
-+of your first [pull request][].
-+
-+[AUTHORS]: AUTHORS
-+[CONTRIBUTORS]: CONTRIBUTORS
-+
-+
-+## Submitting a patch ##
-+
-+  1. It's generally best to start by opening a new issue describing the bug or
-+     feature you're intending to fix.  Even if you think it's relatively minor,
-+     it's helpful to know what people are working on.  Mention in the initial
-+     issue that you are planning to work on that bug or feature so that it can
-+     be assigned to you.
-+
-+  1. Follow the normal process of [forking][] the project, and setup a new
-+     branch to work in.  It's important that each group of changes be done in
-+     separate branches in order to ensure that a pull request only includes the
-+     commits related to that bug or feature.
-+
-+  1. Do your best to have [well-formed commit messages][] for each change.
-+     This provides consistency throughout the project, and ensures that commit
-+     messages are able to be formatted properly by various git tools.
-+
-+  1. Finally, push the commits to your fork and submit a [pull request][].
-+
-+[forking]: https://help.github.com/articles/fork-a-repo
-+[well-formed commit messages]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
-+[pull request]: https://help.github.com/articles/creating-a-pull-request
-diff --git a/lib/benchmark/CONTRIBUTORS b/lib/benchmark/CONTRIBUTORS
-index 88f7eee06c..7489731de5 100644
---- a/lib/benchmark/CONTRIBUTORS
-+++ b/lib/benchmark/CONTRIBUTORS
-@@ -22,12 +22,14 @@
- #
- # Please keep the list sorted.
- 
-+Abhina Sreeskantharajan <abhina.sreeskantharajan@ibm.com>
- Albert Pretorius <pretoalb@gmail.com>
- Alex Steele <steelal123@gmail.com>
- Andriy Berestovskyy <berestovskyy@gmail.com>
- Arne Beer <arne@twobeer.de>
- Billy Robert O'Neal III <billy.oneal@gmail.com> <bion@microsoft.com>
- Chris Kennelly <ckennelly@google.com> <ckennelly@ckennelly.com>
-+Christian Wassermann <christian_wassermann@web.de>
- Christopher Seymour <chris.j.seymour@hotmail.com>
- Colin Braley <braley.colin@gmail.com>
- Cyrille Faucheux <cyrille.faucheux@gmail.com>
-@@ -40,6 +42,7 @@ Eric Backus <eric_backus@alum.mit.edu>
- Eric Fiselier <eric@efcs.ca>
- Eugene Zhuk <eugene.zhuk@gmail.com>
- Evgeny Safronov <division494@gmail.com>
-+Fanbo Meng <fanbo.meng@ibm.com>
- Federico Ficarelli <federico.ficarelli@gmail.com>
- Felix Homann <linuxaudio@showlabor.de>
- Geoffrey Martin-Noble <gcmn@google.com> <gmngeoffrey@gmail.com>
-@@ -59,6 +62,7 @@ Lei Xu <eddyxu@gmail.com>
- Matt Clarkson <mattyclarkson@gmail.com>
- Maxim Vafin <maxvafin@gmail.com>
- Nick Hutchinson <nshutchinson@gmail.com>
-+Norman Heino <norman.heino@gmail.com>
- Oleksandr Sochka <sasha.sochka@gmail.com>
- Ori Livneh <ori.livneh@gmail.com>
- Pascal Leroy <phl@google.com>
-@@ -71,8 +75,11 @@ Robert Guo <robert.guo@mongodb.com>
- Roman Lebedev <lebedev.ri@gmail.com>
- Sayan Bhattacharjee <aero.sayan@gmail.com>
- Shuo Chen <chenshuo@chenshuo.com>
-+Steven Wan <wan.yu@ibm.com>
-+Tobias Schmidt <tobias.schmidt@in.tum.de>
- Tobias Ulvgård <tobias.ulvgard@dirac.se>
- Tom Madams <tom.ej.madams@gmail.com> <tmadams@google.com>
- Yixuan Qiu <yixuanq@gmail.com>
- Yusuke Suzuki <utatane.tea@gmail.com>
- Zbigniew Skowron <zbychs@gmail.com>
-+Min-Yih Hsu <yihshyng223@gmail.com>
-diff --git a/lib/benchmark/README.md b/lib/benchmark/README.md
-new file mode 100644
-index 0000000000..7b81d960fc
---- /dev/null
-+++ b/lib/benchmark/README.md
-@@ -0,0 +1,216 @@
-+# Benchmark
-+
-+[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test)
-+[![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml)
-+[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint)
-+[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings)
-+
-+[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark)
-+[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark)
-+
-+
-+A library to benchmark code snippets, similar to unit tests. Example:
-+
-+```c++
-+#include <benchmark/benchmark.h>
-+
-+static void BM_SomeFunction(benchmark::State& state) {
-+  // Perform setup here
-+  for (auto _ : state) {
-+    // This code gets timed
-+    SomeFunction();
-+  }
-+}
-+// Register the function as a benchmark
-+BENCHMARK(BM_SomeFunction);
-+// Run the benchmark
-+BENCHMARK_MAIN();
-+```
-+
-+## Getting Started
-+
-+To get started, see [Requirements](#requirements) and
-+[Installation](#installation). See [Usage](#usage) for a full example and the
-+[User Guide](docs/user_guide.md) for a more comprehensive feature overview.
-+
-+It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/docs/primer.md)
-+as some of the structural aspects of the APIs are similar.
-+
-+## Resources
-+
-+[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
-+
-+IRC channels:
-+* [libera](https://libera.chat) #benchmark
-+
-+[Additional Tooling Documentation](docs/tools.md)
-+
-+[Assembly Testing Documentation](docs/AssemblyTests.md)
-+
-+## Requirements
-+
-+The library can be used with C++03. However, it requires C++11 to build,
-+including compiler and standard library support.
-+
-+The following minimum versions are required to build the library:
-+
-+* GCC 4.8
-+* Clang 3.4
-+* Visual Studio 14 2015
-+* Intel 2015 Update 1
-+
-+See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md).
-+
-+## Installation
-+
-+This describes the installation process using cmake. As pre-requisites, you'll
-+need git and cmake installed.
-+
-+_See [dependencies.md](docs/dependencies.md) for more details regarding supported
-+versions of build tools._
-+
-+```bash
-+# Check out the library.
-+$ git clone https://github.com/google/benchmark.git
-+# Go to the library root directory
-+$ cd benchmark
-+# Make a build directory to place the build output.
-+$ cmake -E make_directory "build"
-+# Generate build system files with cmake, and download any dependencies.
-+$ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../
-+# or, starting with CMake 3.13, use a simpler form:
-+# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build"
-+# Build the library.
-+$ cmake --build "build" --config Release
-+```
-+This builds the `benchmark` and `benchmark_main` libraries and tests.
-+On a unix system, the build directory should now look something like this:
-+
-+```
-+/benchmark
-+  /build
-+    /src
-+      /libbenchmark.a
-+      /libbenchmark_main.a
-+    /test
-+      ...
-+```
-+
-+Next, you can run the tests to check the build.
-+
-+```bash
-+$ cmake -E chdir "build" ctest --build-config Release
-+```
-+
-+If you want to install the library globally, also run:
-+
-+```
-+sudo cmake --build "build" --config Release --target install
-+```
-+
-+Note that Google Benchmark requires Google Test to build and run the tests. This
-+dependency can be provided two ways:
-+
-+* Checkout the Google Test sources into `benchmark/googletest`.
-+* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during
-+  configuration as above, the library will automatically download and build
-+  any required dependencies.
-+
-+If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF`
-+to `CMAKE_ARGS`.
-+
-+### Debug vs Release
-+
-+By default, benchmark builds as a debug library. You will see a warning in the
-+output when this is the case. To build it as a release library instead, add
-+`-DCMAKE_BUILD_TYPE=Release` when generating the build system files, as shown
-+above. The use of `--config Release` in build commands is needed to properly
-+support multi-configuration tools (like Visual Studio for example) and can be
-+skipped for other build systems (like Makefile).
-+
-+To enable link-time optimisation, also add `-DBENCHMARK_ENABLE_LTO=true` when
-+generating the build system files.
-+
-+If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
-+cache variables, if autodetection fails.
-+
-+If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
-+`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
-+
-+### Stable and Experimental Library Versions
-+
-+The main branch contains the latest stable version of the benchmarking library;
-+the API of which can be considered largely stable, with source breaking changes
-+being made only upon the release of a new major version.
-+
-+Newer, experimental, features are implemented and tested on the
-+[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish
-+to use, test, and provide feedback on the new features are encouraged to try
-+this branch. However, this branch provides no stability guarantees and reserves
-+the right to change and break the API at any time.
-+
-+## Usage
-+
-+### Basic usage
-+
-+Define a function that executes the code to measure, register it as a benchmark
-+function using the `BENCHMARK` macro, and ensure an appropriate `main` function
-+is available:
-+
-+```c++
-+#include <benchmark/benchmark.h>
-+
-+static void BM_StringCreation(benchmark::State& state) {
-+  for (auto _ : state)
-+    std::string empty_string;
-+}
-+// Register the function as a benchmark
-+BENCHMARK(BM_StringCreation);
-+
-+// Define another benchmark
-+static void BM_StringCopy(benchmark::State& state) {
-+  std::string x = "hello";
-+  for (auto _ : state)
-+    std::string copy(x);
-+}
-+BENCHMARK(BM_StringCopy);
-+
-+BENCHMARK_MAIN();
-+```
-+
-+To run the benchmark, compile and link against the `benchmark` library
-+(libbenchmark.a/.so). If you followed the build steps above, this library will 
-+be under the build directory you created.
-+
-+```bash
-+# Example on linux after running the build steps above. Assumes the
-+# `benchmark` and `build` directories are under the current directory.
-+$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \
-+  -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark
-+```
-+
-+Alternatively, link against the `benchmark_main` library and remove
-+`BENCHMARK_MAIN();` above to get the same behavior.
-+
-+The compiled executable will run all benchmarks by default. Pass the `--help`
-+flag for option information or see the [User Guide](docs/user_guide.md).
-+
-+### Usage with CMake
-+
-+If using CMake, it is recommended to link against the project-provided
-+`benchmark::benchmark` and `benchmark::benchmark_main` targets using
-+`target_link_libraries`.
-+It is possible to use ```find_package``` to import an installed version of the
-+library.
-+```cmake
-+find_package(benchmark REQUIRED)
-+```
-+Alternatively, ```add_subdirectory``` will incorporate the library directly in
-+to one's CMake project.
-+```cmake
-+add_subdirectory(benchmark)
-+```
-+Either way, link to the library as follows.
-+```cmake
-+target_link_libraries(MyTarget benchmark::benchmark)
-+```
-diff --git a/lib/benchmark/cmake/AddCXXCompilerFlag.cmake b/lib/benchmark/cmake/AddCXXCompilerFlag.cmake
-index d0d2099814..858589e977 100644
---- a/lib/benchmark/cmake/AddCXXCompilerFlag.cmake
-+++ b/lib/benchmark/cmake/AddCXXCompilerFlag.cmake
-@@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG)
-   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
-   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
-   if(${MANGLED_FLAG})
--    set(VARIANT ${ARGV1})
--    if(ARGV1)
-+    if(ARGC GREATER 1)
-+      set(VARIANT ${ARGV1})
-       string(TOUPPER "_${VARIANT}" VARIANT)
-+    else()
-+      set(VARIANT "")
-     endif()
-     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
-   endif()
-@@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG)
-   check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG})
-   set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
-   if(${MANGLED_FLAG})
--    set(VARIANT ${ARGV1})
--    if(ARGV1)
-+    if(ARGC GREATER 1)
-+      set(VARIANT ${ARGV1})
-       string(TOUPPER "_${VARIANT}" VARIANT)
-+    else()
-+      set(VARIANT "")
-     endif()
-     set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE)
-     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE)
-diff --git a/lib/benchmark/cmake/CXXFeatureCheck.cmake b/lib/benchmark/cmake/CXXFeatureCheck.cmake
-index 059d510dd9..62e6741fe3 100644
---- a/lib/benchmark/cmake/CXXFeatureCheck.cmake
-+++ b/lib/benchmark/cmake/CXXFeatureCheck.cmake
-@@ -27,6 +27,11 @@ function(cxx_feature_check FILE)
-     return()
-   endif()
- 
-+  if (ARGC GREATER 1)
-+    message(STATUS "Enabling additional flags: ${ARGV1}")
-+    list(APPEND BENCHMARK_CXX_LINKER_FLAGS ${ARGV1})
-+  endif()
-+
-   if (NOT DEFINED COMPILE_${FEATURE})
-     message(STATUS "Performing Test ${FEATURE}")
-     if(CMAKE_CROSSCOMPILING)
-diff --git a/lib/benchmark/cmake/GetGitVersion.cmake b/lib/benchmark/cmake/GetGitVersion.cmake
-index 4f10f226d7..04a1f9b70d 100644
---- a/lib/benchmark/cmake/GetGitVersion.cmake
-+++ b/lib/benchmark/cmake/GetGitVersion.cmake
-@@ -20,16 +20,20 @@ set(__get_git_version INCLUDED)
- 
- function(get_git_version var)
-   if(GIT_EXECUTABLE)
--      execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
-+      execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8
-           WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-           RESULT_VARIABLE status
--          OUTPUT_VARIABLE GIT_VERSION
-+          OUTPUT_VARIABLE GIT_DESCRIBE_VERSION
-           ERROR_QUIET)
--      if(${status})
--          set(GIT_VERSION "v0.0.0")
-+      if(status)
-+          set(GIT_DESCRIBE_VERSION "v0.0.0")
-+      endif()
-+      
-+      string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION)
-+      if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-) 
-+         string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2"  GIT_VERSION ${GIT_DESCRIBE_VERSION})
-       else()
--          string(STRIP ${GIT_VERSION} GIT_VERSION)
--          string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION})
-+         string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION})
-       endif()
- 
-       # Work out if the repository is dirty
-@@ -43,12 +47,12 @@ function(get_git_version var)
-           ERROR_QUIET)
-       string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY)
-       if (${GIT_DIRTY})
--          set(GIT_VERSION "${GIT_VERSION}-dirty")
-+          set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty")
-       endif()
-+      message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}")
-   else()
--      set(GIT_VERSION "v0.0.0")
-+      set(GIT_VERSION "0.0.0")
-   endif()
- 
--  message(STATUS "git Version: ${GIT_VERSION}")
-   set(${var} ${GIT_VERSION} PARENT_SCOPE)
- endfunction()
-diff --git a/lib/benchmark/cmake/GoogleTest.cmake b/lib/benchmark/cmake/GoogleTest.cmake
-index dd611fc875..305eb8df7c 100644
---- a/lib/benchmark/cmake/GoogleTest.cmake
-+++ b/lib/benchmark/cmake/GoogleTest.cmake
-@@ -29,6 +29,9 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
- 
- include(${GOOGLETEST_PREFIX}/googletest-paths.cmake)
- 
-+# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves.
-+add_compile_options(-w)
-+
- # Add googletest directly to our build. This defines
- # the gtest and gtest_main targets.
- add_subdirectory(${GOOGLETEST_SOURCE_DIR}
-diff --git a/lib/benchmark/cmake/GoogleTest.cmake.in b/lib/benchmark/cmake/GoogleTest.cmake.in
-index 28818ee293..fd957ff564 100644
---- a/lib/benchmark/cmake/GoogleTest.cmake.in
-+++ b/lib/benchmark/cmake/GoogleTest.cmake.in
-@@ -31,7 +31,7 @@ if(EXISTS "${GOOGLETEST_PATH}"            AND IS_DIRECTORY "${GOOGLETEST_PATH}"
-   )
- else()
-   if(NOT ALLOW_DOWNLOADING_GOOGLETEST)
--    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable ALLOW_DOWNLOADING_GOOGLETEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
-+    message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.")
-   else()
-     message(WARNING "Did not find Google Test sources! Fetching from web...")
-     ExternalProject_Add(
-diff --git a/lib/benchmark/cmake/Modules/FindPFM.cmake b/lib/benchmark/cmake/Modules/FindPFM.cmake
-new file mode 100644
-index 0000000000..cf807a1ee9
---- /dev/null
-+++ b/lib/benchmark/cmake/Modules/FindPFM.cmake
-@@ -0,0 +1,26 @@
-+# If successful, the following variables will be defined:
-+# HAVE_LIBPFM.
-+# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
-+include(CheckIncludeFile)
-+include(CheckLibraryExists)
-+include(FeatureSummary)
-+enable_language(C)
-+
-+set_package_properties(PFM PROPERTIES
-+                       URL http://perfmon2.sourceforge.net/
-+                       DESCRIPTION "a helper library to develop monitoring tools"
-+                       PURPOSE "Used to program specific performance monitoring events")
-+
-+check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
-+if(HAVE_LIBPFM_INITIALIZE)
-+  check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
-+  check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
-+  check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
-+  if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
-+    message("Using Perf Counters.")
-+    set(HAVE_LIBPFM 1)
-+    set(PFM_FOUND 1)
-+  endif()
-+else()
-+  message("Perf Counters support requested, but was unable to find libpfm.")
-+endif()
-diff --git a/lib/benchmark/cmake/benchmark.pc.in b/lib/benchmark/cmake/benchmark.pc.in
-index 43ca8f91d7..34beb012ee 100644
---- a/lib/benchmark/cmake/benchmark.pc.in
-+++ b/lib/benchmark/cmake/benchmark.pc.in
-@@ -1,7 +1,7 @@
- prefix=@CMAKE_INSTALL_PREFIX@
- exec_prefix=${prefix}
--libdir=${prefix}/lib
--includedir=${prefix}/include
-+libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@
-+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
- 
- Name: @PROJECT_NAME@
- Description: Google microbenchmark framework
-diff --git a/lib/benchmark/include/benchmark/benchmark.h b/lib/benchmark/include/benchmark/benchmark.h
-index e5f6778958..fbb5340ce8 100644
---- a/lib/benchmark/include/benchmark/benchmark.h
-+++ b/lib/benchmark/include/benchmark/benchmark.h
-@@ -42,6 +42,7 @@ BENCHMARK(BM_StringCopy);
- int main(int argc, char** argv) {
-   benchmark::Initialize(&argc, argv);
-   benchmark::RunSpecifiedBenchmarks();
-+  benchmark::Shutdown();
-   return 0;
- }
- 
-@@ -139,13 +140,13 @@ thread exits the loop body. As such, any global setup or teardown you want to
- do can be wrapped in a check against the thread index:
- 
- static void BM_MultiThreaded(benchmark::State& state) {
--  if (state.thread_index == 0) {
-+  if (state.thread_index() == 0) {
-     // Setup code here.
-   }
-   for (auto _ : state) {
-     // Run the test as normal.
-   }
--  if (state.thread_index == 0) {
-+  if (state.thread_index() == 0) {
-     // Teardown code here.
-   }
- }
-@@ -167,6 +168,12 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
- #define BENCHMARK_HAS_CXX11
- #endif
- 
-+// This _MSC_VER check should detect VS 2017 v15.3 and newer.
-+#if __cplusplus >= 201703L || \
-+    (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L)
-+#define BENCHMARK_HAS_CXX17
-+#endif
-+
- #include <stdint.h>
- 
- #include <algorithm>
-@@ -176,9 +183,11 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
- #include <map>
- #include <set>
- #include <string>
-+#include <utility>
- #include <vector>
- 
- #if defined(BENCHMARK_HAS_CXX11)
-+#include <atomic>
- #include <initializer_list>
- #include <type_traits>
- #include <utility>
-@@ -198,13 +207,19 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
-   TypeName& operator=(const TypeName&) = delete
- #endif
- 
--#if defined(__GNUC__)
-+#ifdef BENCHMARK_HAS_CXX17
-+#define BENCHMARK_UNUSED [[maybe_unused]]
-+#elif defined(__GNUC__) || defined(__clang__)
- #define BENCHMARK_UNUSED __attribute__((unused))
-+#else
-+#define BENCHMARK_UNUSED
-+#endif
-+
-+#if defined(__GNUC__) || defined(__clang__)
- #define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
- #define BENCHMARK_NOEXCEPT noexcept
- #define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
- #elif defined(_MSC_VER) && !defined(__clang__)
--#define BENCHMARK_UNUSED
- #define BENCHMARK_ALWAYS_INLINE __forceinline
- #if _MSC_VER >= 1900
- #define BENCHMARK_NOEXCEPT noexcept
-@@ -215,7 +230,6 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
- #endif
- #define __func__ __FUNCTION__
- #else
--#define BENCHMARK_UNUSED
- #define BENCHMARK_ALWAYS_INLINE
- #define BENCHMARK_NOEXCEPT
- #define BENCHMARK_NOEXCEPT_OP(x)
-@@ -251,11 +265,18 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
- #define BENCHMARK_UNREACHABLE() ((void)0)
- #endif
- 
-+#ifdef BENCHMARK_HAS_CXX11
-+#define BENCHMARK_OVERRIDE override
-+#else
-+#define BENCHMARK_OVERRIDE
-+#endif
-+
- namespace benchmark {
- class BenchmarkReporter;
- class MemoryManager;
- 
- void Initialize(int* argc, char** argv);
-+void Shutdown();
- 
- // Report to stdout all arguments in 'argv' as unrecognized except the first.
- // Returns true there is at least on unrecognized argument (i.e. 'argc' > 1).
-@@ -282,6 +303,9 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
- // allocation measurements for benchmark runs.
- void RegisterMemoryManager(MemoryManager* memory_manager);
- 
-+// Add a key-value pair to output as part of the context stanza in the report.
-+void AddCustomContext(const std::string& key, const std::string& value);
-+
- namespace internal {
- class Benchmark;
- class BenchmarkImp;
-@@ -304,6 +328,14 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
- #define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
- #endif
- 
-+// Force the compiler to flush pending writes to global memory. Acts as an
-+// effective read/write barrier
-+#ifdef BENCHMARK_HAS_CXX11
-+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-+  std::atomic_signal_fence(std::memory_order_acq_rel);
-+}
-+#endif
-+
- // The DoNotOptimize(...) function can be used to prevent a value or
- // expression from being optimized away by the compiler. This function is
- // intended to add little to no overhead.
-@@ -323,11 +355,11 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
- #endif
- }
- 
--// Force the compiler to flush pending writes to global memory. Acts as an
--// effective read/write barrier
-+#ifndef BENCHMARK_HAS_CXX11
- inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
-   asm volatile("" : : : "memory");
- }
-+#endif
- #elif defined(_MSC_VER)
- template <class Tp>
- inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-@@ -335,13 +367,15 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-   _ReadWriteBarrier();
- }
- 
-+#ifndef BENCHMARK_HAS_CXX11
- inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
-+#endif
- #else
- template <class Tp>
- inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
-   internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
- }
--// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
-+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11.
- #endif
- 
- // This class is used for user-defined counters.
-@@ -406,7 +440,7 @@ typedef std::map<std::string, Counter> UserCounters;
- 
- // TimeUnit is passed to a benchmark in order to specify the order of magnitude
- // for the measured time.
--enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
-+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond };
- 
- // BigO is passed to a benchmark in order to specify the asymptotic
- // computational
-@@ -416,6 +450,8 @@ enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
- 
- typedef uint64_t IterationCount;
- 
-+enum StatisticUnit { kTime, kPercentage };
-+
- // BigOFunc is passed to a benchmark in order to specify the asymptotic
- // computational complexity for the benchmark.
- typedef double(BigOFunc)(IterationCount);
-@@ -428,14 +464,17 @@ namespace internal {
- struct Statistics {
-   std::string name_;
-   StatisticsFunc* compute_;
-+  StatisticUnit unit_;
- 
--  Statistics(const std::string& name, StatisticsFunc* compute)
--      : name_(name), compute_(compute) {}
-+  Statistics(const std::string& name, StatisticsFunc* compute,
-+             StatisticUnit unit = kTime)
-+      : name_(name), compute_(compute), unit_(unit) {}
- };
- 
--struct BenchmarkInstance;
-+class BenchmarkInstance;
- class ThreadTimer;
- class ThreadManager;
-+class PerfCountersMeasurement;
- 
- enum AggregationReportMode
- #if defined(BENCHMARK_HAS_CXX11)
-@@ -632,6 +671,14 @@ class State {
-   BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
-   int64_t range_y() const { return range(1); }
- 
-+  // Number of threads concurrently executing the benchmark.
-+  BENCHMARK_ALWAYS_INLINE
-+  int threads() const { return threads_; }
-+
-+  // Index of the executing thread. Values from [0, threads).
-+  BENCHMARK_ALWAYS_INLINE
-+  int thread_index() const { return thread_index_; }
-+
-   BENCHMARK_ALWAYS_INLINE
-   IterationCount iterations() const {
-     if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-@@ -640,8 +687,8 @@ class State {
-     return max_iterations - total_iterations_ + batch_leftover_;
-   }
- 
-- private
--     :  // items we expect on the first cache line (ie 64 bytes of the struct)
-+ private:
-+  // items we expect on the first cache line (ie 64 bytes of the struct)
-   // When total_iterations_ is 0, KeepRunning() and friends will return false.
-   // May be larger than max_iterations.
-   IterationCount total_iterations_;
-@@ -667,25 +714,27 @@ class State {
-  public:
-   // Container for user-defined counters.
-   UserCounters counters;
--  // Index of the executing thread. Values from [0, threads).
--  const int thread_index;
--  // Number of threads concurrently executing the benchmark.
--  const int threads;
- 
-  private:
-   State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-         int thread_i, int n_threads, internal::ThreadTimer* timer,
--        internal::ThreadManager* manager);
-+        internal::ThreadManager* manager,
-+        internal::PerfCountersMeasurement* perf_counters_measurement);
- 
-   void StartKeepRunning();
-   // Implementation of KeepRunning() and KeepRunningBatch().
-   // is_batch must be true unless n is 1.
-   bool KeepRunningInternal(IterationCount n, bool is_batch);
-   void FinishKeepRunning();
--  internal::ThreadTimer* timer_;
--  internal::ThreadManager* manager_;
- 
--  friend struct internal::BenchmarkInstance;
-+  const int thread_index_;
-+  const int threads_;
-+
-+  internal::ThreadTimer* const timer_;
-+  internal::ThreadManager* const manager_;
-+  internal::PerfCountersMeasurement* const perf_counters_measurement_;
-+
-+  friend class internal::BenchmarkInstance;
- };
- 
- inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
-@@ -789,6 +838,9 @@ class Benchmark {
-   // Note: the following methods all return "this" so that multiple
-   // method calls can be chained together in one expression.
- 
-+  // Specify the name of the benchmark
-+  Benchmark* Name(const std::string& name);
-+
-   // Run this benchmark once with "x" as the extra argument passed
-   // to the function.
-   // REQUIRES: The function passed to the constructor must accept an arg1.
-@@ -827,6 +879,11 @@ class Benchmark {
-   // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-   Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
- 
-+  // Run this benchmark once for each combination of values in the (cartesian)
-+  // product of the supplied argument lists.
-+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
-+  Benchmark* ArgsProduct(const std::vector<std::vector<int64_t> >& arglists);
-+
-   // Equivalent to ArgNames({name})
-   Benchmark* ArgName(const std::string& name);
- 
-@@ -912,7 +969,8 @@ class Benchmark {
-   Benchmark* Complexity(BigOFunc* complexity);
- 
-   // Add this statistics to be computed over all the values of benchmark run
--  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
-+  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics,
-+                               StatisticUnit unit = kTime);
- 
-   // Support for running multiple copies of the same benchmark concurrently
-   // in multiple threads.  This may be useful when measuring the scaling
-@@ -955,6 +1013,7 @@ class Benchmark {
- 
-  private:
-   friend class BenchmarkFamilies;
-+  friend class BenchmarkInstance;
- 
-   std::string name_;
-   AggregationReportMode aggregation_report_mode_;
-@@ -1002,7 +1061,7 @@ class FunctionBenchmark : public Benchmark {
-   FunctionBenchmark(const char* name, Function* func)
-       : Benchmark(name), func_(func) {}
- 
--  virtual void Run(State& st);
-+  virtual void Run(State& st) BENCHMARK_OVERRIDE;
- 
-  private:
-   Function* func_;
-@@ -1012,7 +1071,7 @@ class FunctionBenchmark : public Benchmark {
- template <class Lambda>
- class LambdaBenchmark : public Benchmark {
-  public:
--  virtual void Run(State& st) { lambda_(st); }
-+  virtual void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); }
- 
-  private:
-   template <class OLambda>
-@@ -1064,7 +1123,7 @@ class Fixture : public internal::Benchmark {
-  public:
-   Fixture() : internal::Benchmark("") {}
- 
--  virtual void Run(State& st) {
-+  virtual void Run(State& st) BENCHMARK_OVERRIDE {
-     this->SetUp(st);
-     this->BenchmarkCase(st);
-     this->TearDown(st);
-@@ -1097,9 +1156,12 @@ class Fixture : public internal::Benchmark {
- 
- // Helpers for generating unique variable names
- #define BENCHMARK_PRIVATE_NAME(n) \
--  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
-+  BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
- #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
- #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
-+// Helper for concatenation with macro name expansion
-+#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \
-+    BaseClass##_##Method##_Benchmark
- 
- #define BENCHMARK_PRIVATE_DECLARE(n)                                 \
-   static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
-@@ -1169,37 +1231,37 @@ class Fixture : public internal::Benchmark {
- #define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
- #endif
- 
--#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
--  class BaseClass##_##Method##_Benchmark : public BaseClass { \
--   public:                                                    \
--    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
--      this->SetName(#BaseClass "/" #Method);                  \
--    }                                                         \
--                                                              \
--   protected:                                                 \
--    virtual void BenchmarkCase(::benchmark::State&);          \
-+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)                  \
-+  class BaseClass##_##Method##_Benchmark : public BaseClass {           \
-+   public:                                                              \
-+    BaseClass##_##Method##_Benchmark() : BaseClass() {                  \
-+      this->SetName(#BaseClass "/" #Method);                            \
-+    }                                                                   \
-+                                                                        \
-+   protected:                                                           \
-+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
-   };
- 
--#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
--  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
--   public:                                                          \
--    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {           \
--      this->SetName(#BaseClass "<" #a ">/" #Method);                \
--    }                                                               \
--                                                                    \
--   protected:                                                       \
--    virtual void BenchmarkCase(::benchmark::State&);                \
-+#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a)     \
-+  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {        \
-+   public:                                                              \
-+    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {               \
-+      this->SetName(#BaseClass "<" #a ">/" #Method);                    \
-+    }                                                                   \
-+                                                                        \
-+   protected:                                                           \
-+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
-   };
- 
--#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
--  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
--   public:                                                             \
--    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {           \
--      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);            \
--    }                                                                  \
--                                                                       \
--   protected:                                                          \
--    virtual void BenchmarkCase(::benchmark::State&);                   \
-+#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b)  \
-+  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {     \
-+   public:                                                              \
-+    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {            \
-+      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);             \
-+    }                                                                   \
-+                                                                        \
-+   protected:                                                           \
-+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \
-   };
- 
- #ifdef BENCHMARK_HAS_CXX11
-@@ -1211,7 +1273,7 @@ class Fixture : public internal::Benchmark {
-     }                                                                      \
-                                                                            \
-    protected:                                                              \
--    virtual void BenchmarkCase(::benchmark::State&);                       \
-+    virtual void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE;    \
-   };
- #else
- #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
-@@ -1220,27 +1282,27 @@ class Fixture : public internal::Benchmark {
- 
- #define BENCHMARK_DEFINE_F(BaseClass, Method)    \
-   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- 
- #define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)    \
-   BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- 
- #define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b)    \
-   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- 
- #ifdef BENCHMARK_HAS_CXX11
- #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
-   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- #else
- #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
-   BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
- #endif
- 
- #define BENCHMARK_REGISTER_F(BaseClass, Method) \
--  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
-+  BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method))
- 
- #define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
-   BENCHMARK_PRIVATE_DECLARE(TestName) =        \
-@@ -1250,23 +1312,23 @@ class Fixture : public internal::Benchmark {
- #define BENCHMARK_F(BaseClass, Method)           \
-   BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
-   BENCHMARK_REGISTER_F(BaseClass, Method);       \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- 
- #define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)           \
-   BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
-   BENCHMARK_REGISTER_F(BaseClass, Method);                    \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- 
- #define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b)           \
-   BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
-   BENCHMARK_REGISTER_F(BaseClass, Method);                       \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- 
- #ifdef BENCHMARK_HAS_CXX11
- #define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
-   BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
-   BENCHMARK_REGISTER_F(BaseClass, Method);                             \
--  void BaseClass##_##Method##_Benchmark::BenchmarkCase
-+  void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase
- #else
- #define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
-   BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
-@@ -1278,6 +1340,8 @@ class Fixture : public internal::Benchmark {
-     ::benchmark::Initialize(&argc, argv);                               \
-     if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \
-     ::benchmark::RunSpecifiedBenchmarks();                              \
-+    ::benchmark::Shutdown();                                            \
-+    return 0;                                                           \
-   }                                                                     \
-   int main(int, char**)
- 
-@@ -1294,10 +1358,16 @@ struct CPUInfo {
-     int num_sharing;
-   };
- 
-+  enum Scaling {
-+    UNKNOWN,
-+    ENABLED,
-+    DISABLED
-+  };
-+
-   int num_cpus;
-+  Scaling scaling;
-   double cycles_per_second;
-   std::vector<CacheInfo> caches;
--  bool scaling_enabled;
-   std::vector<double> load_avg;
- 
-   static const CPUInfo& Get();
-@@ -1356,6 +1426,7 @@ class BenchmarkReporter {
- 
-     Run()
-         : run_type(RT_Iteration),
-+          aggregate_unit(kTime),
-           error_occurred(false),
-           iterations(1),
-           threads(1),
-@@ -1375,8 +1446,11 @@ class BenchmarkReporter {
- 
-     std::string benchmark_name() const;
-     BenchmarkName run_name;
-+    int64_t family_index;
-+    int64_t per_family_instance_index;
-     RunType run_type;
-     std::string aggregate_name;
-+    StatisticUnit aggregate_unit;
-     std::string report_label;  // Empty if not set by benchmark.
-     bool error_occurred;
-     std::string error_message;
-@@ -1424,6 +1498,19 @@ class BenchmarkReporter {
-     int64_t max_bytes_used;
-   };
- 
-+  struct PerFamilyRunReports {
-+    PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
-+
-+    // How many runs will all instances of this benchmark perform?
-+    int num_runs_total;
-+
-+    // How many runs have happened already?
-+    int num_runs_done;
-+
-+    // The reports about (non-errneous!) runs of this family.
-+    std::vector<BenchmarkReporter::Run> Runs;
-+  };
-+
-   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
-   // and the error stream set to 'std::cerr'
-   BenchmarkReporter();
-@@ -1496,8 +1583,8 @@ class ConsoleReporter : public BenchmarkReporter {
-         prev_counters_(),
-         printed_header_(false) {}
- 
--  virtual bool ReportContext(const Context& context);
--  virtual void ReportRuns(const std::vector<Run>& reports);
-+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-+  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
- 
-  protected:
-   virtual void PrintRunData(const Run& report);
-@@ -1512,9 +1599,9 @@ class ConsoleReporter : public BenchmarkReporter {
- class JSONReporter : public BenchmarkReporter {
-  public:
-   JSONReporter() : first_report_(true) {}
--  virtual bool ReportContext(const Context& context);
--  virtual void ReportRuns(const std::vector<Run>& reports);
--  virtual void Finalize();
-+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-+  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
-+  virtual void Finalize() BENCHMARK_OVERRIDE;
- 
-  private:
-   void PrintRunData(const Run& report);
-@@ -1527,8 +1614,8 @@ class BENCHMARK_DEPRECATED_MSG(
-     : public BenchmarkReporter {
-  public:
-   CSVReporter() : printed_header_(false) {}
--  virtual bool ReportContext(const Context& context);
--  virtual void ReportRuns(const std::vector<Run>& reports);
-+  virtual bool ReportContext(const Context& context) BENCHMARK_OVERRIDE;
-+  virtual void ReportRuns(const std::vector<Run>& reports) BENCHMARK_OVERRIDE;
- 
-  private:
-   void PrintRunData(const Run& report);
-@@ -1562,6 +1649,8 @@ class MemoryManager {
- 
- inline const char* GetTimeUnitString(TimeUnit unit) {
-   switch (unit) {
-+    case kSecond:
-+      return "s";
-     case kMillisecond:
-       return "ms";
-     case kMicrosecond:
-@@ -1574,6 +1663,8 @@ inline const char* GetTimeUnitString(TimeUnit unit) {
- 
- inline double GetTimeUnitMultiplier(TimeUnit unit) {
-   switch (unit) {
-+    case kSecond:
-+      return 1;
-     case kMillisecond:
-       return 1e3;
-     case kMicrosecond:
-@@ -1584,6 +1675,21 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) {
-   BENCHMARK_UNREACHABLE();
- }
- 
-+// Creates a list of integer values for the given range and multiplier.
-+// This can be used together with ArgsProduct() to allow multiple ranges
-+// with different multiplers.
-+// Example:
-+// ArgsProduct({
-+//   CreateRange(0, 1024, /*multi=*/32),
-+//   CreateRange(0, 100, /*multi=*/4),
-+//   CreateDenseRange(0, 4, /*step=*/1),
-+// });
-+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi);
-+
-+// Creates a list of integer values for the given range and step.
-+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit,
-+                                      int step);
-+
- }  // namespace benchmark
- 
- #endif  // BENCHMARK_BENCHMARK_H_
-diff --git a/lib/benchmark/src/CMakeLists.txt b/lib/benchmark/src/CMakeLists.txt
-index 28acc92dde..1df8a4aa8e 100644
---- a/lib/benchmark/src/CMakeLists.txt
-+++ b/lib/benchmark/src/CMakeLists.txt
-@@ -1,7 +1,3 @@
--if(POLICY CMP0069)
--  cmake_policy(SET CMP0069 NEW)
--endif()
--
- # Allow the source files to find headers in src/
- include(GNUInstallDirs)
- include_directories(${PROJECT_SOURCE_DIR}/src)
-@@ -32,6 +28,12 @@ target_include_directories(benchmark PUBLIC
-     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
-     )
- 
-+# libpfm, if available
-+if (HAVE_LIBPFM)
-+  target_link_libraries(benchmark libpfm.a)
-+  add_definitions(-DHAVE_LIBPFM)
-+endif()
-+
- # Link threads.
- target_link_libraries(benchmark  ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
- find_library(LIBRT rt)
-@@ -115,4 +117,8 @@ if (BENCHMARK_ENABLE_INSTALL)
-       EXPORT "${targets_export_name}"
-       NAMESPACE "${namespace}"
-       DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}")
-+
-+  install(
-+    DIRECTORY "${PROJECT_SOURCE_DIR}/docs/"
-+    DESTINATION "${CMAKE_INSTALL_PREFIX}/share/doc/${PROJECT_NAME}")
- endif()
-diff --git a/lib/benchmark/src/benchmark.cc b/lib/benchmark/src/benchmark.cc
-index b751b9c31f..a086453a94 100644
---- a/lib/benchmark/src/benchmark.cc
-+++ b/lib/benchmark/src/benchmark.cc
-@@ -13,6 +13,7 @@
- // limitations under the License.
- 
- #include "benchmark/benchmark.h"
-+
- #include "benchmark_api_internal.h"
- #include "benchmark_runner.h"
- #include "internal_macros.h"
-@@ -32,7 +33,10 @@
- #include <cstdlib>
- #include <fstream>
- #include <iostream>
-+#include <limits>
-+#include <map>
- #include <memory>
-+#include <random>
- #include <string>
- #include <thread>
- #include <utility>
-@@ -45,71 +49,85 @@
- #include "internal_macros.h"
- #include "log.h"
- #include "mutex.h"
-+#include "perf_counters.h"
- #include "re.h"
- #include "statistics.h"
- #include "string_util.h"
- #include "thread_manager.h"
- #include "thread_timer.h"
- 
-+namespace benchmark {
- // Print a list of benchmarks. This option overrides all other options.
--DEFINE_bool(benchmark_list_tests, false);
-+BM_DEFINE_bool(benchmark_list_tests, false);
- 
- // A regular expression that specifies the set of benchmarks to execute.  If
- // this flag is empty, or if this flag is the string \"all\", all benchmarks
- // linked into the binary are run.
--DEFINE_string(benchmark_filter, ".");
-+BM_DEFINE_string(benchmark_filter, "");
- 
- // Minimum number of seconds we should run benchmark before results are
- // considered significant.  For cpu-time based tests, this is the lower bound
- // on the total cpu time used by all threads that make up the test.  For
- // real-time based tests, this is the lower bound on the elapsed time of the
- // benchmark execution, regardless of number of threads.
--DEFINE_double(benchmark_min_time, 0.5);
-+BM_DEFINE_double(benchmark_min_time, 0.5);
- 
- // The number of runs of each benchmark. If greater than 1, the mean and
- // standard deviation of the runs will be reported.
--DEFINE_int32(benchmark_repetitions, 1);
-+BM_DEFINE_int32(benchmark_repetitions, 1);
-+
-+// If set, enable random interleaving of repetitions of all benchmarks.
-+// See http://github.com/google/benchmark/issues/1051 for details.
-+BM_DEFINE_bool(benchmark_enable_random_interleaving, false);
- 
- // Report the result of each benchmark repetitions. When 'true' is specified
- // only the mean, standard deviation, and other statistics are reported for
- // repeated benchmarks. Affects all reporters.
--DEFINE_bool(benchmark_report_aggregates_only, false);
-+BM_DEFINE_bool(benchmark_report_aggregates_only, false);
- 
- // Display the result of each benchmark repetitions. When 'true' is specified
- // only the mean, standard deviation, and other statistics are displayed for
- // repeated benchmarks. Unlike benchmark_report_aggregates_only, only affects
- // the display reporter, but  *NOT* file reporter, which will still contain
- // all the output.
--DEFINE_bool(benchmark_display_aggregates_only, false);
-+BM_DEFINE_bool(benchmark_display_aggregates_only, false);
- 
- // The format to use for console output.
- // Valid values are 'console', 'json', or 'csv'.
--DEFINE_string(benchmark_format, "console");
-+BM_DEFINE_string(benchmark_format, "console");
- 
- // The format to use for file output.
- // Valid values are 'console', 'json', or 'csv'.
--DEFINE_string(benchmark_out_format, "json");
-+BM_DEFINE_string(benchmark_out_format, "json");
- 
- // The file to write additional output to.
--DEFINE_string(benchmark_out, "");
-+BM_DEFINE_string(benchmark_out, "");
- 
- // Whether to use colors in the output.  Valid values:
- // 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if
- // the output is being sent to a terminal and the TERM environment variable is
- // set to a terminal type that supports colors.
--DEFINE_string(benchmark_color, "auto");
-+BM_DEFINE_string(benchmark_color, "auto");
- 
- // Whether to use tabular format when printing user counters to the console.
- // Valid values: 'true'/'yes'/1, 'false'/'no'/0.  Defaults to false.
--DEFINE_bool(benchmark_counters_tabular, false);
-+BM_DEFINE_bool(benchmark_counters_tabular, false);
- 
--// The level of verbose logging to output
--DEFINE_int32(v, 0);
-+// List of additional perf counters to collect, in libpfm format. For more
-+// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html
-+BM_DEFINE_string(benchmark_perf_counters, "");
- 
--namespace benchmark {
-+// Extra context to include in the output formatted as comma-separated key-value
-+// pairs. Kept internal as it's only used for parsing from env/command line.
-+BM_DEFINE_kvpairs(benchmark_context, {});
-+
-+// The level of verbose logging to output
-+BM_DEFINE_int32(v, 0);
- 
- namespace internal {
- 
-+std::map<std::string, std::string>* global_context = nullptr;
-+
- // FIXME: wouldn't LTO mess this up?
- void UseCharPointer(char const volatile*) {}
- 
-@@ -117,7 +135,8 @@ void UseCharPointer(char const volatile*) {}
- 
- State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-              int thread_i, int n_threads, internal::ThreadTimer* timer,
--             internal::ThreadManager* manager)
-+             internal::ThreadManager* manager,
-+             internal::PerfCountersMeasurement* perf_counters_measurement)
-     : total_iterations_(0),
-       batch_leftover_(0),
-       max_iterations(max_iters),
-@@ -127,12 +146,14 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
-       range_(ranges),
-       complexity_n_(0),
-       counters(),
--      thread_index(thread_i),
--      threads(n_threads),
-+      thread_index_(thread_i),
-+      threads_(n_threads),
-       timer_(timer),
--      manager_(manager) {
--  CHECK(max_iterations != 0) << "At least one iteration must be run";
--  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
-+      manager_(manager),
-+      perf_counters_measurement_(perf_counters_measurement) {
-+  BM_CHECK(max_iterations != 0) << "At least one iteration must be run";
-+  BM_CHECK_LT(thread_index_, threads_)
-+      << "thread_index must be less than threads";
- 
-   // Note: The use of offsetof below is technically undefined until C++17
-   // because State is not a standard layout type. However, all compilers
-@@ -161,17 +182,29 @@ State::State(IterationCount max_iters, const std::vector<int64_t>& ranges,
- 
- void State::PauseTiming() {
-   // Add in time accumulated so far
--  CHECK(started_ && !finished_ && !error_occurred_);
-+  BM_CHECK(started_ && !finished_ && !error_occurred_);
-   timer_->StopTimer();
-+  if (perf_counters_measurement_) {
-+    auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
-+    for (const auto& name_and_measurement : measurements) {
-+      auto name = name_and_measurement.first;
-+      auto measurement = name_and_measurement.second;
-+      BM_CHECK_EQ(counters[name], 0.0);
-+      counters[name] = Counter(measurement, Counter::kAvgIterations);
-+    }
-+  }
- }
- 
- void State::ResumeTiming() {
--  CHECK(started_ && !finished_ && !error_occurred_);
-+  BM_CHECK(started_ && !finished_ && !error_occurred_);
-   timer_->StartTimer();
-+  if (perf_counters_measurement_) {
-+    perf_counters_measurement_->Start();
-+  }
- }
- 
- void State::SkipWithError(const char* msg) {
--  CHECK(msg);
-+  BM_CHECK(msg);
-   error_occurred_ = true;
-   {
-     MutexLock l(manager_->GetBenchmarkMutex());
-@@ -194,7 +227,7 @@ void State::SetLabel(const char* label) {
- }
- 
- void State::StartKeepRunning() {
--  CHECK(!started_ && !finished_);
-+  BM_CHECK(!started_ && !finished_);
-   started_ = true;
-   total_iterations_ = error_occurred_ ? 0 : max_iterations;
-   manager_->StartStopBarrier();
-@@ -202,7 +235,7 @@ void State::StartKeepRunning() {
- }
- 
- void State::FinishKeepRunning() {
--  CHECK(started_ && (!finished_ || error_occurred_));
-+  BM_CHECK(started_ && (!finished_ || error_occurred_));
-   if (!error_occurred_) {
-     PauseTiming();
-   }
-@@ -215,11 +248,42 @@ void State::FinishKeepRunning() {
- namespace internal {
- namespace {
- 
-+// Flushes streams after invoking reporter methods that write to them. This
-+// ensures users get timely updates even when streams are not line-buffered.
-+void FlushStreams(BenchmarkReporter* reporter) {
-+  if (!reporter) return;
-+  std::flush(reporter->GetOutputStream());
-+  std::flush(reporter->GetErrorStream());
-+}
-+
-+// Reports in both display and file reporters.
-+void Report(BenchmarkReporter* display_reporter,
-+            BenchmarkReporter* file_reporter, const RunResults& run_results) {
-+  auto report_one = [](BenchmarkReporter* reporter, bool aggregates_only,
-+                       const RunResults& results) {
-+    assert(reporter);
-+    // If there are no aggregates, do output non-aggregates.
-+    aggregates_only &= !results.aggregates_only.empty();
-+    if (!aggregates_only) reporter->ReportRuns(results.non_aggregates);
-+    if (!results.aggregates_only.empty())
-+      reporter->ReportRuns(results.aggregates_only);
-+  };
-+
-+  report_one(display_reporter, run_results.display_report_aggregates_only,
-+             run_results);
-+  if (file_reporter)
-+    report_one(file_reporter, run_results.file_report_aggregates_only,
-+               run_results);
-+
-+  FlushStreams(display_reporter);
-+  FlushStreams(file_reporter);
-+}
-+
- void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
-                    BenchmarkReporter* display_reporter,
-                    BenchmarkReporter* file_reporter) {
-   // Note the file_reporter can be null.
--  CHECK(display_reporter != nullptr);
-+  BM_CHECK(display_reporter != nullptr);
- 
-   // Determine the width of the name field using a minimum width of 10.
-   bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
-@@ -227,10 +291,10 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
-   size_t stat_field_width = 0;
-   for (const BenchmarkInstance& benchmark : benchmarks) {
-     name_field_width =
--        std::max<size_t>(name_field_width, benchmark.name.str().size());
--    might_have_aggregates |= benchmark.repetitions > 1;
-+        std::max<size_t>(name_field_width, benchmark.name().str().size());
-+    might_have_aggregates |= benchmark.repetitions() > 1;
- 
--    for (const auto& Stat : *benchmark.statistics)
-+    for (const auto& Stat : benchmark.statistics())
-       stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
-   }
-   if (might_have_aggregates) name_field_width += 1 + stat_field_width;
-@@ -239,55 +303,84 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
-   BenchmarkReporter::Context context;
-   context.name_field_width = name_field_width;
- 
--  // Keep track of running times of all instances of current benchmark
--  std::vector<BenchmarkReporter::Run> complexity_reports;
--
--  // We flush streams after invoking reporter methods that write to them. This
--  // ensures users get timely updates even when streams are not line-buffered.
--  auto flushStreams = [](BenchmarkReporter* reporter) {
--    if (!reporter) return;
--    std::flush(reporter->GetOutputStream());
--    std::flush(reporter->GetErrorStream());
--  };
-+  // Keep track of running times of all instances of each benchmark family.
-+  std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
-+      per_family_reports;
- 
-   if (display_reporter->ReportContext(context) &&
-       (!file_reporter || file_reporter->ReportContext(context))) {
--    flushStreams(display_reporter);
--    flushStreams(file_reporter);
--
--    for (const auto& benchmark : benchmarks) {
--      RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
--
--      auto report = [&run_results](BenchmarkReporter* reporter,
--                                   bool report_aggregates_only) {
--        assert(reporter);
--        // If there are no aggregates, do output non-aggregates.
--        report_aggregates_only &= !run_results.aggregates_only.empty();
--        if (!report_aggregates_only)
--          reporter->ReportRuns(run_results.non_aggregates);
--        if (!run_results.aggregates_only.empty())
--          reporter->ReportRuns(run_results.aggregates_only);
--      };
--
--      report(display_reporter, run_results.display_report_aggregates_only);
--      if (file_reporter)
--        report(file_reporter, run_results.file_report_aggregates_only);
--
--      flushStreams(display_reporter);
--      flushStreams(file_reporter);
-+    FlushStreams(display_reporter);
-+    FlushStreams(file_reporter);
-+
-+    size_t num_repetitions_total = 0;
-+
-+    std::vector<internal::BenchmarkRunner> runners;
-+    runners.reserve(benchmarks.size());
-+    for (const BenchmarkInstance& benchmark : benchmarks) {
-+      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
-+      if (benchmark.complexity() != oNone)
-+        reports_for_family = &per_family_reports[benchmark.family_index()];
-+
-+      runners.emplace_back(benchmark, reports_for_family);
-+      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
-+      num_repetitions_total += num_repeats_of_this_instance;
-+      if (reports_for_family)
-+        reports_for_family->num_runs_total += num_repeats_of_this_instance;
-+    }
-+    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
-+
-+    std::vector<size_t> repetition_indices;
-+    repetition_indices.reserve(num_repetitions_total);
-+    for (size_t runner_index = 0, num_runners = runners.size();
-+         runner_index != num_runners; ++runner_index) {
-+      const internal::BenchmarkRunner& runner = runners[runner_index];
-+      std::fill_n(std::back_inserter(repetition_indices),
-+                  runner.GetNumRepeats(), runner_index);
-+    }
-+    assert(repetition_indices.size() == num_repetitions_total &&
-+           "Unexpected number of repetition indexes.");
-+
-+    if (FLAGS_benchmark_enable_random_interleaving) {
-+      std::random_device rd;
-+      std::mt19937 g(rd());
-+      std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
-+    }
-+
-+    for (size_t repetition_index : repetition_indices) {
-+      internal::BenchmarkRunner& runner = runners[repetition_index];
-+      runner.DoOneRepetition();
-+      if (runner.HasRepeatsRemaining()) continue;
-+      // FIXME: report each repetition separately, not all of them in bulk.
-+
-+      RunResults run_results = runner.GetResults();
-+
-+      // Maybe calculate complexity report
-+      if (const auto* reports_for_family = runner.GetReportsForFamily()) {
-+        if (reports_for_family->num_runs_done ==
-+            reports_for_family->num_runs_total) {
-+          auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
-+          run_results.aggregates_only.insert(run_results.aggregates_only.end(),
-+                                             additional_run_stats.begin(),
-+                                             additional_run_stats.end());
-+          per_family_reports.erase(
-+              (int)reports_for_family->Runs.front().family_index);
-+        }
-+      }
-+
-+      Report(display_reporter, file_reporter, run_results);
-     }
-   }
-   display_reporter->Finalize();
-   if (file_reporter) file_reporter->Finalize();
--  flushStreams(display_reporter);
--  flushStreams(file_reporter);
-+  FlushStreams(display_reporter);
-+  FlushStreams(file_reporter);
- }
- 
- // Disable deprecated warnings temporarily because we need to reference
--// CSVReporter but don't want to trigger -Werror=-Wdeprecated
-+// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations
- #ifdef __GNUC__
- #pragma GCC diagnostic push
--#pragma GCC diagnostic ignored "-Wdeprecated"
-+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
- #endif
- 
- std::unique_ptr<BenchmarkReporter> CreateReporter(
-@@ -377,7 +470,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
-   if (!fname.empty()) {
-     output_file.open(fname);
-     if (!output_file.is_open()) {
--      Err << "invalid file name: '" << fname << std::endl;
-+      Err << "invalid file name: '" << fname << "'" << std::endl;
-       std::exit(1);
-     }
-     if (!file_reporter) {
-@@ -399,7 +492,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
- 
-   if (FLAGS_benchmark_list_tests) {
-     for (auto const& benchmark : benchmarks)
--      Out << benchmark.name.str() << "\n";
-+      Out << benchmark.name().str() << "\n";
-   } else {
-     internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
-   }
-@@ -411,6 +504,16 @@ void RegisterMemoryManager(MemoryManager* manager) {
-   internal::memory_manager = manager;
- }
- 
-+void AddCustomContext(const std::string& key, const std::string& value) {
-+  if (internal::global_context == nullptr) {
-+    internal::global_context = new std::map<std::string, std::string>();
-+  }
-+  if (!internal::global_context->emplace(key, value).second) {
-+    std::cerr << "Failed to add custom context \"" << key << "\" as it already "
-+              << "exists with value \"" << value << "\"\n";
-+  }
-+}
-+
- namespace internal {
- 
- void PrintUsageAndExit() {
-@@ -420,6 +523,7 @@ void PrintUsageAndExit() {
-           "          [--benchmark_filter=<regex>]\n"
-           "          [--benchmark_min_time=<min_time>]\n"
-           "          [--benchmark_repetitions=<num_repetitions>]\n"
-+          "          [--benchmark_enable_random_interleaving={true|false}]\n"
-           "          [--benchmark_report_aggregates_only={true|false}]\n"
-           "          [--benchmark_display_aggregates_only={true|false}]\n"
-           "          [--benchmark_format=<console|json|csv>]\n"
-@@ -427,6 +531,8 @@ void PrintUsageAndExit() {
-           "          [--benchmark_out_format=<json|console|csv>]\n"
-           "          [--benchmark_color={auto|true|false}]\n"
-           "          [--benchmark_counters_tabular={true|false}]\n"
-+          "          [--benchmark_perf_counters=<counter>,...]\n"
-+          "          [--benchmark_context=<key>=<value>,...]\n"
-           "          [--v=<verbosity>]\n");
-   exit(0);
- }
-@@ -443,6 +549,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
-                         &FLAGS_benchmark_min_time) ||
-         ParseInt32Flag(argv[i], "benchmark_repetitions",
-                        &FLAGS_benchmark_repetitions) ||
-+        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
-+                      &FLAGS_benchmark_enable_random_interleaving) ||
-         ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
-                       &FLAGS_benchmark_report_aggregates_only) ||
-         ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
-@@ -457,6 +565,10 @@ void ParseCommandLineFlags(int* argc, char** argv) {
-         ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) ||
-         ParseBoolFlag(argv[i], "benchmark_counters_tabular",
-                       &FLAGS_benchmark_counters_tabular) ||
-+        ParseStringFlag(argv[i], "benchmark_perf_counters",
-+                        &FLAGS_benchmark_perf_counters) ||
-+        ParseKeyValueFlag(argv[i], "benchmark_context",
-+                          &FLAGS_benchmark_context) ||
-         ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-       for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1];
- 
-@@ -467,13 +579,17 @@ void ParseCommandLineFlags(int* argc, char** argv) {
-     }
-   }
-   for (auto const* flag :
--       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format})
-+       {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) {
-     if (*flag != "console" && *flag != "json" && *flag != "csv") {
-       PrintUsageAndExit();
-     }
-+  }
-   if (FLAGS_benchmark_color.empty()) {
-     PrintUsageAndExit();
-   }
-+  for (const auto& kv : FLAGS_benchmark_context) {
-+    AddCustomContext(kv.first, kv.second);
-+  }
- }
- 
- int InitializeStreams() {
-@@ -488,6 +604,10 @@ void Initialize(int* argc, char** argv) {
-   internal::LogLevel() = FLAGS_v;
- }
- 
-+void Shutdown() {
-+  delete internal::global_context;
-+}
-+
- bool ReportUnrecognizedArguments(int argc, char** argv) {
-   for (int i = 1; i < argc; ++i) {
-     fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0],
-diff --git a/lib/benchmark/src/benchmark_api_internal.cc b/lib/benchmark/src/benchmark_api_internal.cc
-index 83c4eeb5c3..89da519afc 100644
---- a/lib/benchmark/src/benchmark_api_internal.cc
-+++ b/lib/benchmark/src/benchmark_api_internal.cc
-@@ -1,15 +1,94 @@
- #include "benchmark_api_internal.h"
- 
-+#include <cinttypes>
-+
-+#include "string_util.h"
-+
- namespace benchmark {
- namespace internal {
- 
--State BenchmarkInstance::Run(IterationCount iters, int thread_id,
--                             internal::ThreadTimer* timer,
--                             internal::ThreadManager* manager) const {
--  State st(iters, arg, thread_id, threads, timer, manager);
--  benchmark->Run(st);
-+BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx,
-+                                     int per_family_instance_idx,
-+                                     const std::vector<int64_t>& args,
-+                                     int thread_count)
-+    : benchmark_(*benchmark),
-+      family_index_(family_idx),
-+      per_family_instance_index_(per_family_instance_idx),
-+      aggregation_report_mode_(benchmark_.aggregation_report_mode_),
-+      args_(args),
-+      time_unit_(benchmark_.time_unit_),
-+      measure_process_cpu_time_(benchmark_.measure_process_cpu_time_),
-+      use_real_time_(benchmark_.use_real_time_),
-+      use_manual_time_(benchmark_.use_manual_time_),
-+      complexity_(benchmark_.complexity_),
-+      complexity_lambda_(benchmark_.complexity_lambda_),
-+      statistics_(benchmark_.statistics_),
-+      repetitions_(benchmark_.repetitions_),
-+      min_time_(benchmark_.min_time_),
-+      iterations_(benchmark_.iterations_),
-+      threads_(thread_count) {
-+  name_.function_name = benchmark_.name_;
-+
-+  size_t arg_i = 0;
-+  for (const auto& arg : args) {
-+    if (!name_.args.empty()) {
-+      name_.args += '/';
-+    }
-+
-+    if (arg_i < benchmark->arg_names_.size()) {
-+      const auto& arg_name = benchmark_.arg_names_[arg_i];
-+      if (!arg_name.empty()) {
-+        name_.args += StrFormat("%s:", arg_name.c_str());
-+      }
-+    }
-+
-+    name_.args += StrFormat("%" PRId64, arg);
-+    ++arg_i;
-+  }
-+
-+  if (!IsZero(benchmark->min_time_)) {
-+    name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_);
-+  }
-+
-+  if (benchmark_.iterations_ != 0) {
-+    name_.iterations = StrFormat(
-+        "iterations:%lu", static_cast<unsigned long>(benchmark_.iterations_));
-+  }
-+
-+  if (benchmark_.repetitions_ != 0) {
-+    name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_);
-+  }
-+
-+  if (benchmark_.measure_process_cpu_time_) {
-+    name_.time_type = "process_time";
-+  }
-+
-+  if (benchmark_.use_manual_time_) {
-+    if (!name_.time_type.empty()) {
-+      name_.time_type += '/';
-+    }
-+    name_.time_type += "manual_time";
-+  } else if (benchmark_.use_real_time_) {
-+    if (!name_.time_type.empty()) {
-+      name_.time_type += '/';
-+    }
-+    name_.time_type += "real_time";
-+  }
-+
-+  if (!benchmark_.thread_counts_.empty()) {
-+    name_.threads = StrFormat("threads:%d", threads_);
-+  }
-+}
-+
-+State BenchmarkInstance::Run(
-+    IterationCount iters, int thread_id, internal::ThreadTimer* timer,
-+    internal::ThreadManager* manager,
-+    internal::PerfCountersMeasurement* perf_counters_measurement) const {
-+  State st(iters, args_, thread_id, threads_, timer, manager,
-+           perf_counters_measurement);
-+  benchmark_.Run(st);
-   return st;
- }
- 
--} // namespace internal
--} // namespace benchmark
-+}  // namespace internal
-+}  // namespace benchmark
-diff --git a/lib/benchmark/src/benchmark_api_internal.h b/lib/benchmark/src/benchmark_api_internal.h
-index 264eff95c5..592dd46303 100644
---- a/lib/benchmark/src/benchmark_api_internal.h
-+++ b/lib/benchmark/src/benchmark_api_internal.h
-@@ -1,9 +1,6 @@
- #ifndef BENCHMARK_API_INTERNAL_H
- #define BENCHMARK_API_INTERNAL_H
- 
--#include "benchmark/benchmark.h"
--#include "commandlineflags.h"
--
- #include <cmath>
- #include <iosfwd>
- #include <limits>
-@@ -11,32 +8,60 @@
- #include <string>
- #include <vector>
- 
-+#include "benchmark/benchmark.h"
-+#include "commandlineflags.h"
-+
- namespace benchmark {
- namespace internal {
- 
- // Information kept per benchmark we may want to run
--struct BenchmarkInstance {
--  BenchmarkName name;
--  Benchmark* benchmark;
--  AggregationReportMode aggregation_report_mode;
--  std::vector<int64_t> arg;
--  TimeUnit time_unit;
--  int range_multiplier;
--  bool measure_process_cpu_time;
--  bool use_real_time;
--  bool use_manual_time;
--  BigO complexity;
--  BigOFunc* complexity_lambda;
--  UserCounters counters;
--  const std::vector<Statistics>* statistics;
--  bool last_benchmark_instance;
--  int repetitions;
--  double min_time;
--  IterationCount iterations;
--  int threads;  // Number of concurrent threads to us
-+class BenchmarkInstance {
-+ public:
-+  BenchmarkInstance(Benchmark* benchmark, int family_index,
-+                    int per_family_instance_index,
-+                    const std::vector<int64_t>& args, int threads);
-+
-+  const BenchmarkName& name() const { return name_; }
-+  int family_index() const { return family_index_; }
-+  int per_family_instance_index() const { return per_family_instance_index_; }
-+  AggregationReportMode aggregation_report_mode() const {
-+    return aggregation_report_mode_;
-+  }
-+  TimeUnit time_unit() const { return time_unit_; }
-+  bool measure_process_cpu_time() const { return measure_process_cpu_time_; }
-+  bool use_real_time() const { return use_real_time_; }
-+  bool use_manual_time() const { return use_manual_time_; }
-+  BigO complexity() const { return complexity_; }
-+  BigOFunc* complexity_lambda() const { return complexity_lambda_; }
-+  const std::vector<Statistics>& statistics() const { return statistics_; }
-+  int repetitions() const { return repetitions_; }
-+  double min_time() const { return min_time_; }
-+  IterationCount iterations() const { return iterations_; }
-+  int threads() const { return threads_; }
- 
-   State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
--            internal::ThreadManager* manager) const;
-+            internal::ThreadManager* manager,
-+            internal::PerfCountersMeasurement* perf_counters_measurement) const;
-+
-+ private:
-+  BenchmarkName name_;
-+  Benchmark& benchmark_;
-+  const int family_index_;
-+  const int per_family_instance_index_;
-+  AggregationReportMode aggregation_report_mode_;
-+  const std::vector<int64_t>& args_;
-+  TimeUnit time_unit_;
-+  bool measure_process_cpu_time_;
-+  bool use_real_time_;
-+  bool use_manual_time_;
-+  BigO complexity_;
-+  BigOFunc* complexity_lambda_;
-+  UserCounters counters_;
-+  const std::vector<Statistics>& statistics_;
-+  int repetitions_;
-+  double min_time_;
-+  IterationCount iterations_;
-+  int threads_;  // Number of concurrent threads to us
- };
- 
- bool FindBenchmarksInternal(const std::string& re,
-diff --git a/lib/benchmark/src/benchmark_register.cc b/lib/benchmark/src/benchmark_register.cc
-index cca39b2215..f2b32bdbf8 100644
---- a/lib/benchmark/src/benchmark_register.cc
-+++ b/lib/benchmark/src/benchmark_register.cc
-@@ -24,6 +24,7 @@
- 
- #include <algorithm>
- #include <atomic>
-+#include <cinttypes>
- #include <condition_variable>
- #include <cstdio>
- #include <cstdlib>
-@@ -31,14 +32,10 @@
- #include <fstream>
- #include <iostream>
- #include <memory>
-+#include <numeric>
- #include <sstream>
- #include <thread>
- 
--#ifndef __STDC_FORMAT_MACROS
--#define __STDC_FORMAT_MACROS
--#endif
--#include <inttypes.h>
--
- #include "benchmark/benchmark.h"
- #include "benchmark_api_internal.h"
- #include "check.h"
-@@ -114,7 +111,7 @@ void BenchmarkFamilies::ClearBenchmarks() {
- bool BenchmarkFamilies::FindBenchmarks(
-     std::string spec, std::vector<BenchmarkInstance>* benchmarks,
-     std::ostream* ErrStream) {
--  CHECK(ErrStream);
-+  BM_CHECK(ErrStream);
-   auto& Err = *ErrStream;
-   // Make regular expression out of command-line flag
-   std::string error_msg;
-@@ -132,8 +129,13 @@ bool BenchmarkFamilies::FindBenchmarks(
-   // Special list of thread counts to use when none are specified
-   const std::vector<int> one_thread = {1};
- 
-+  int next_family_index = 0;
-+
-   MutexLock l(mutex_);
-   for (std::unique_ptr<Benchmark>& family : families_) {
-+    int family_index = next_family_index;
-+    int per_family_instance_index = 0;
-+
-     // Family was deleted or benchmark doesn't match
-     if (!family) continue;
- 
-@@ -153,84 +155,24 @@ bool BenchmarkFamilies::FindBenchmarks(
-     }
-     // reserve in the special case the regex ".", since we know the final
-     // family size.
--    if (spec == ".") benchmarks->reserve(family_size);
-+    if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size);
- 
-     for (auto const& args : family->args_) {
-       for (int num_threads : *thread_counts) {
--        BenchmarkInstance instance;
--        instance.name.function_name = family->name_;
--        instance.benchmark = family.get();
--        instance.aggregation_report_mode = family->aggregation_report_mode_;
--        instance.arg = args;
--        instance.time_unit = family->time_unit_;
--        instance.range_multiplier = family->range_multiplier_;
--        instance.min_time = family->min_time_;
--        instance.iterations = family->iterations_;
--        instance.repetitions = family->repetitions_;
--        instance.measure_process_cpu_time = family->measure_process_cpu_time_;
--        instance.use_real_time = family->use_real_time_;
--        instance.use_manual_time = family->use_manual_time_;
--        instance.complexity = family->complexity_;
--        instance.complexity_lambda = family->complexity_lambda_;
--        instance.statistics = &family->statistics_;
--        instance.threads = num_threads;
--
--        // Add arguments to instance name
--        size_t arg_i = 0;
--        for (auto const& arg : args) {
--          if (!instance.name.args.empty()) {
--            instance.name.args += '/';
--          }
--
--          if (arg_i < family->arg_names_.size()) {
--            const auto& arg_name = family->arg_names_[arg_i];
--            if (!arg_name.empty()) {
--              instance.name.args += StrFormat("%s:", arg_name.c_str());
--            }
--          }
--
--          instance.name.args += StrFormat("%" PRId64, arg);
--          ++arg_i;
--        }
--
--        if (!IsZero(family->min_time_))
--          instance.name.min_time =
--              StrFormat("min_time:%0.3f", family->min_time_);
--        if (family->iterations_ != 0) {
--          instance.name.iterations =
--              StrFormat("iterations:%lu",
--                        static_cast<unsigned long>(family->iterations_));
--        }
--        if (family->repetitions_ != 0)
--          instance.name.repetitions =
--              StrFormat("repeats:%d", family->repetitions_);
--
--        if (family->measure_process_cpu_time_) {
--          instance.name.time_type = "process_time";
--        }
-+        BenchmarkInstance instance(family.get(), family_index,
-+                                   per_family_instance_index, args,
-+                                   num_threads);
- 
--        if (family->use_manual_time_) {
--          if (!instance.name.time_type.empty()) {
--            instance.name.time_type += '/';
--          }
--          instance.name.time_type += "manual_time";
--        } else if (family->use_real_time_) {
--          if (!instance.name.time_type.empty()) {
--            instance.name.time_type += '/';
--          }
--          instance.name.time_type += "real_time";
--        }
--
--        // Add the number of threads used to the name
--        if (!family->thread_counts_.empty()) {
--          instance.name.threads = StrFormat("threads:%d", instance.threads);
--        }
--
--        const auto full_name = instance.name.str();
-+        const auto full_name = instance.name().str();
-         if ((re.Match(full_name) && !isNegativeFilter) ||
-             (!re.Match(full_name) && isNegativeFilter)) {
--          instance.last_benchmark_instance = (&args == &family->args_.back());
-           benchmarks->push_back(std::move(instance));
-+
-+          ++per_family_instance_index;
-+
-+          // Only bump the next family index once we've estabilished that
-+          // at least one instance of this family will be run.
-+          if (next_family_index == family_index) ++next_family_index;
-         }
-       }
-     }
-@@ -273,12 +215,18 @@ Benchmark::Benchmark(const char* name)
-   ComputeStatistics("mean", StatisticsMean);
-   ComputeStatistics("median", StatisticsMedian);
-   ComputeStatistics("stddev", StatisticsStdDev);
-+  ComputeStatistics("cv", StatisticsCV, kPercentage);
- }
- 
- Benchmark::~Benchmark() {}
- 
-+Benchmark* Benchmark::Name(const std::string& name) {
-+  SetName(name.c_str());
-+  return this;
-+}
-+
- Benchmark* Benchmark::Arg(int64_t x) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-   args_.push_back({x});
-   return this;
- }
-@@ -289,7 +237,7 @@ Benchmark* Benchmark::Unit(TimeUnit unit) {
- }
- 
- Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-   std::vector<int64_t> arglist;
-   AddRange(&arglist, start, limit, range_multiplier_);
- 
-@@ -301,53 +249,61 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) {
- 
- Benchmark* Benchmark::Ranges(
-     const std::vector<std::pair<int64_t, int64_t>>& ranges) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(ranges.size()));
-   std::vector<std::vector<int64_t>> arglists(ranges.size());
--  std::size_t total = 1;
-   for (std::size_t i = 0; i < ranges.size(); i++) {
-     AddRange(&arglists[i], ranges[i].first, ranges[i].second,
-              range_multiplier_);
--    total *= arglists[i].size();
-   }
- 
--  std::vector<std::size_t> ctr(arglists.size(), 0);
--
--  for (std::size_t i = 0; i < total; i++) {
--    std::vector<int64_t> tmp;
--    tmp.reserve(arglists.size());
-+  ArgsProduct(arglists);
- 
--    for (std::size_t j = 0; j < arglists.size(); j++) {
--      tmp.push_back(arglists[j].at(ctr[j]));
--    }
-+  return this;
-+}
- 
--    args_.push_back(std::move(tmp));
-+Benchmark* Benchmark::ArgsProduct(
-+    const std::vector<std::vector<int64_t>>& arglists) {
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(arglists.size()));
- 
--    for (std::size_t j = 0; j < arglists.size(); j++) {
--      if (ctr[j] + 1 < arglists[j].size()) {
--        ++ctr[j];
--        break;
--      }
--      ctr[j] = 0;
-+  std::vector<std::size_t> indices(arglists.size());
-+  const std::size_t total = std::accumulate(
-+      std::begin(arglists), std::end(arglists), std::size_t{1},
-+      [](const std::size_t res, const std::vector<int64_t>& arglist) {
-+        return res * arglist.size();
-+      });
-+  std::vector<int64_t> args;
-+  args.reserve(arglists.size());
-+  for (std::size_t i = 0; i < total; i++) {
-+    for (std::size_t arg = 0; arg < arglists.size(); arg++) {
-+      args.push_back(arglists[arg][indices[arg]]);
-     }
-+    args_.push_back(args);
-+    args.clear();
-+
-+    std::size_t arg = 0;
-+    do {
-+      indices[arg] = (indices[arg] + 1) % arglists[arg].size();
-+    } while (indices[arg++] == 0 && arg < arglists.size());
-   }
-+
-   return this;
- }
- 
- Benchmark* Benchmark::ArgName(const std::string& name) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-   arg_names_ = {name};
-   return this;
- }
- 
- Benchmark* Benchmark::ArgNames(const std::vector<std::string>& names) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(names.size()));
-   arg_names_ = names;
-   return this;
- }
- 
- Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
--  CHECK_LE(start, limit);
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1);
-+  BM_CHECK_LE(start, limit);
-   for (int64_t arg = start; arg <= limit; arg += step) {
-     args_.push_back({arg});
-   }
-@@ -355,7 +311,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) {
- }
- 
- Benchmark* Benchmark::Args(const std::vector<int64_t>& args) {
--  CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
-+  BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast<int>(args.size()));
-   args_.push_back(args);
-   return this;
- }
-@@ -366,27 +322,27 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
- }
- 
- Benchmark* Benchmark::RangeMultiplier(int multiplier) {
--  CHECK(multiplier > 1);
-+  BM_CHECK(multiplier > 1);
-   range_multiplier_ = multiplier;
-   return this;
- }
- 
- Benchmark* Benchmark::MinTime(double t) {
--  CHECK(t > 0.0);
--  CHECK(iterations_ == 0);
-+  BM_CHECK(t > 0.0);
-+  BM_CHECK(iterations_ == 0);
-   min_time_ = t;
-   return this;
- }
- 
- Benchmark* Benchmark::Iterations(IterationCount n) {
--  CHECK(n > 0);
--  CHECK(IsZero(min_time_));
-+  BM_CHECK(n > 0);
-+  BM_CHECK(IsZero(min_time_));
-   iterations_ = n;
-   return this;
- }
- 
- Benchmark* Benchmark::Repetitions(int n) {
--  CHECK(n > 0);
-+  BM_CHECK(n > 0);
-   repetitions_ = n;
-   return this;
- }
-@@ -419,14 +375,14 @@ Benchmark* Benchmark::MeasureProcessCPUTime() {
- }
- 
- Benchmark* Benchmark::UseRealTime() {
--  CHECK(!use_manual_time_)
-+  BM_CHECK(!use_manual_time_)
-       << "Cannot set UseRealTime and UseManualTime simultaneously.";
-   use_real_time_ = true;
-   return this;
- }
- 
- Benchmark* Benchmark::UseManualTime() {
--  CHECK(!use_real_time_)
-+  BM_CHECK(!use_real_time_)
-       << "Cannot set UseRealTime and UseManualTime simultaneously.";
-   use_manual_time_ = true;
-   return this;
-@@ -444,20 +400,21 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) {
- }
- 
- Benchmark* Benchmark::ComputeStatistics(std::string name,
--                                        StatisticsFunc* statistics) {
--  statistics_.emplace_back(name, statistics);
-+                                        StatisticsFunc* statistics,
-+                                        StatisticUnit unit) {
-+  statistics_.emplace_back(name, statistics, unit);
-   return this;
- }
- 
- Benchmark* Benchmark::Threads(int t) {
--  CHECK_GT(t, 0);
-+  BM_CHECK_GT(t, 0);
-   thread_counts_.push_back(t);
-   return this;
- }
- 
- Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
--  CHECK_GT(min_threads, 0);
--  CHECK_GE(max_threads, min_threads);
-+  BM_CHECK_GT(min_threads, 0);
-+  BM_CHECK_GE(max_threads, min_threads);
- 
-   AddRange(&thread_counts_, min_threads, max_threads, 2);
-   return this;
-@@ -465,9 +422,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
- 
- Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads,
-                                        int stride) {
--  CHECK_GT(min_threads, 0);
--  CHECK_GE(max_threads, min_threads);
--  CHECK_GE(stride, 1);
-+  BM_CHECK_GT(min_threads, 0);
-+  BM_CHECK_GE(max_threads, min_threads);
-+  BM_CHECK_GE(stride, 1);
- 
-   for (auto i = min_threads; i < max_threads; i += stride) {
-     thread_counts_.push_back(i);
-@@ -503,4 +460,20 @@ void ClearRegisteredBenchmarks() {
-   internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks();
- }
- 
-+std::vector<int64_t> CreateRange(int64_t lo, int64_t hi, int multi) {
-+  std::vector<int64_t> args;
-+  internal::AddRange(&args, lo, hi, multi);
-+  return args;
-+}
-+
-+std::vector<int64_t> CreateDenseRange(int64_t start, int64_t limit,
-+                                      int step) {
-+  BM_CHECK_LE(start, limit);
-+  std::vector<int64_t> args;
-+  for (int64_t arg = start; arg <= limit; arg += step) {
-+    args.push_back(arg);
-+  }
-+  return args;
-+}
-+
- }  // end namespace benchmark
-diff --git a/lib/benchmark/src/benchmark_register.h b/lib/benchmark/src/benchmark_register.h
-index 61377d7423..7033dbf622 100644
---- a/lib/benchmark/src/benchmark_register.h
-+++ b/lib/benchmark/src/benchmark_register.h
-@@ -1,6 +1,7 @@
- #ifndef BENCHMARK_REGISTER_H
- #define BENCHMARK_REGISTER_H
- 
-+#include <limits>
- #include <vector>
- 
- #include "check.h"
-@@ -13,16 +14,16 @@ namespace internal {
- template <typename T>
- typename std::vector<T>::iterator
- AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
--  CHECK_GE(lo, 0);
--  CHECK_GE(hi, lo);
--  CHECK_GE(mult, 2);
-+  BM_CHECK_GE(lo, 0);
-+  BM_CHECK_GE(hi, lo);
-+  BM_CHECK_GE(mult, 2);
- 
-   const size_t start_offset = dst->size();
- 
-   static const T kmax = std::numeric_limits<T>::max();
- 
-   // Space out the values in multiples of "mult"
--  for (T i = 1; i <= hi; i *= mult) {
-+  for (T i = static_cast<T>(1); i <= hi; i *= mult) {
-     if (i >= lo) {
-       dst->push_back(i);
-     }
-@@ -37,10 +38,10 @@ AddPowers(std::vector<T>* dst, T lo, T hi, int mult) {
- template <typename T>
- void AddNegatedPowers(std::vector<T>* dst, T lo, T hi, int mult) {
-   // We negate lo and hi so we require that they cannot be equal to 'min'.
--  CHECK_GT(lo, std::numeric_limits<T>::min());
--  CHECK_GT(hi, std::numeric_limits<T>::min());
--  CHECK_GE(hi, lo);
--  CHECK_LE(hi, 0);
-+  BM_CHECK_GT(lo, std::numeric_limits<T>::min());
-+  BM_CHECK_GT(hi, std::numeric_limits<T>::min());
-+  BM_CHECK_GE(hi, lo);
-+  BM_CHECK_LE(hi, 0);
- 
-   // Add positive powers, then negate and reverse.
-   // Casts necessary since small integers get promoted
-@@ -59,8 +60,8 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
-   static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
-                 "Args type must be a signed integer");
- 
--  CHECK_GE(hi, lo);
--  CHECK_GE(mult, 2);
-+  BM_CHECK_GE(hi, lo);
-+  BM_CHECK_GE(mult, 2);
- 
-   // Add "lo"
-   dst->push_back(lo);
-@@ -86,7 +87,7 @@ void AddRange(std::vector<T>* dst, T lo, T hi, int mult) {
-   }
- 
-   // Treat 0 as a special case (see discussion on #762).
--  if (lo <= 0 && hi >= 0) {
-+  if (lo < 0 && hi >= 0) {
-     dst->push_back(0);
-   }
- 
-diff --git a/lib/benchmark/src/benchmark_runner.cc b/lib/benchmark/src/benchmark_runner.cc
-index c414eff9a9..ead5c5a26f 100644
---- a/lib/benchmark/src/benchmark_runner.cc
-+++ b/lib/benchmark/src/benchmark_runner.cc
-@@ -13,6 +13,7 @@
- // limitations under the License.
- 
- #include "benchmark_runner.h"
-+
- #include "benchmark/benchmark.h"
- #include "benchmark_api_internal.h"
- #include "internal_macros.h"
-@@ -45,6 +46,7 @@
- #include "internal_macros.h"
- #include "log.h"
- #include "mutex.h"
-+#include "perf_counters.h"
- #include "re.h"
- #include "statistics.h"
- #include "string_util.h"
-@@ -66,32 +68,34 @@ BenchmarkReporter::Run CreateRunReport(
-     const internal::ThreadManager::Result& results,
-     IterationCount memory_iterations,
-     const MemoryManager::Result& memory_result, double seconds,
--    int64_t repetition_index) {
-+    int64_t repetition_index, int64_t repeats) {
-   // Create report about this benchmark run.
-   BenchmarkReporter::Run report;
- 
--  report.run_name = b.name;
-+  report.run_name = b.name();
-+  report.family_index = b.family_index();
-+  report.per_family_instance_index = b.per_family_instance_index();
-   report.error_occurred = results.has_error_;
-   report.error_message = results.error_message_;
-   report.report_label = results.report_label_;
-   // This is the total iterations across all threads.
-   report.iterations = results.iterations;
--  report.time_unit = b.time_unit;
--  report.threads = b.threads;
-+  report.time_unit = b.time_unit();
-+  report.threads = b.threads();
-   report.repetition_index = repetition_index;
--  report.repetitions = b.repetitions;
-+  report.repetitions = repeats;
- 
-   if (!report.error_occurred) {
--    if (b.use_manual_time) {
-+    if (b.use_manual_time()) {
-       report.real_accumulated_time = results.manual_time_used;
-     } else {
-       report.real_accumulated_time = results.real_time_used;
-     }
-     report.cpu_accumulated_time = results.cpu_time_used;
-     report.complexity_n = results.complexity_n;
--    report.complexity = b.complexity;
--    report.complexity_lambda = b.complexity_lambda;
--    report.statistics = b.statistics;
-+    report.complexity = b.complexity();
-+    report.complexity_lambda = b.complexity_lambda();
-+    report.statistics = &b.statistics();
-     report.counters = results.counters;
- 
-     if (memory_iterations > 0) {
-@@ -103,21 +107,24 @@ BenchmarkReporter::Run CreateRunReport(
-       report.max_bytes_used = memory_result.max_bytes_used;
-     }
- 
--    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
-+    internal::Finish(&report.counters, results.iterations, seconds,
-+                     b.threads());
-   }
-   return report;
- }
- 
- // Execute one thread of benchmark b for the specified number of iterations.
--// Adds the stats collected for the thread into *total.
-+// Adds the stats collected for the thread into manager->results.
- void RunInThread(const BenchmarkInstance* b, IterationCount iters,
--                 int thread_id, ThreadManager* manager) {
-+                 int thread_id, ThreadManager* manager,
-+                 PerfCountersMeasurement* perf_counters_measurement) {
-   internal::ThreadTimer timer(
--      b->measure_process_cpu_time
-+      b->measure_process_cpu_time()
-           ? internal::ThreadTimer::CreateProcessCpuTime()
-           : internal::ThreadTimer::Create());
--  State st = b->Run(iters, thread_id, &timer, manager);
--  CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
-+  State st =
-+      b->Run(iters, thread_id, &timer, manager, perf_counters_measurement);
-+  BM_CHECK(st.error_occurred() || st.iterations() >= st.max_iterations)
-       << "Benchmark returned before State::KeepRunning() returned false!";
-   {
-     MutexLock l(manager->GetBenchmarkMutex());
-@@ -132,228 +139,208 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
-   manager->NotifyThreadComplete();
- }
- 
--class BenchmarkRunner {
-- public:
--  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
--                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
--      : b(b_),
--        complexity_reports(*complexity_reports_),
--        min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
--        repeats(b.repetitions != 0 ? b.repetitions
-+}  // end namespace
-+
-+BenchmarkRunner::BenchmarkRunner(
-+    const benchmark::internal::BenchmarkInstance& b_,
-+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
-+    : b(b_),
-+      reports_for_family(reports_for_family_),
-+      min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
-+      repeats(b.repetitions() != 0 ? b.repetitions()
-                                    : FLAGS_benchmark_repetitions),
--        has_explicit_iteration_count(b.iterations != 0),
--        pool(b.threads - 1),
--        iters(has_explicit_iteration_count ? b.iterations : 1) {
-+      has_explicit_iteration_count(b.iterations() != 0),
-+      pool(b.threads() - 1),
-+      iters(has_explicit_iteration_count ? b.iterations() : 1),
-+      perf_counters_measurement(
-+          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
-+      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
-+                                        ? &perf_counters_measurement
-+                                        : nullptr) {
-+  run_results.display_report_aggregates_only =
-+      (FLAGS_benchmark_report_aggregates_only ||
-+       FLAGS_benchmark_display_aggregates_only);
-+  run_results.file_report_aggregates_only =
-+      FLAGS_benchmark_report_aggregates_only;
-+  if (b.aggregation_report_mode() != internal::ARM_Unspecified) {
-     run_results.display_report_aggregates_only =
--        (FLAGS_benchmark_report_aggregates_only ||
--         FLAGS_benchmark_display_aggregates_only);
-+        (b.aggregation_report_mode() &
-+         internal::ARM_DisplayReportAggregatesOnly);
-     run_results.file_report_aggregates_only =
--        FLAGS_benchmark_report_aggregates_only;
--    if (b.aggregation_report_mode != internal::ARM_Unspecified) {
--      run_results.display_report_aggregates_only =
--          (b.aggregation_report_mode &
--           internal::ARM_DisplayReportAggregatesOnly);
--      run_results.file_report_aggregates_only =
--          (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
--    }
-+        (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
-+    BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
-+             perf_counters_measurement.IsValid())
-+        << "Perf counters were requested but could not be set up.";
-+  }
-+}
- 
--    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
--      DoOneRepetition(repetition_num);
--    }
-+BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
-+  BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n";
- 
--    // Calculate additional statistics
--    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
-+  std::unique_ptr<internal::ThreadManager> manager;
-+  manager.reset(new internal::ThreadManager(b.threads()));
- 
--    // Maybe calculate complexity report
--    if ((b.complexity != oNone) && b.last_benchmark_instance) {
--      auto additional_run_stats = ComputeBigO(complexity_reports);
--      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
--                                         additional_run_stats.begin(),
--                                         additional_run_stats.end());
--      complexity_reports.clear();
--    }
-+  // Run all but one thread in separate threads
-+  for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-+    pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
-+                           manager.get(), perf_counters_measurement_ptr);
-   }
-+  // And run one thread here directly.
-+  // (If we were asked to run just one thread, we don't create new threads.)
-+  // Yes, we need to do this here *after* we start the separate threads.
-+  RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr);
- 
--  RunResults&& get_results() { return std::move(run_results); }
-+  // The main thread has finished. Now let's wait for the other threads.
-+  manager->WaitForAllThreads();
-+  for (std::thread& thread : pool) thread.join();
- 
-- private:
--  RunResults run_results;
-+  IterationResults i;
-+  // Acquire the measurements/counters from the manager, UNDER THE LOCK!
-+  {
-+    MutexLock l(manager->GetBenchmarkMutex());
-+    i.results = manager->results;
-+  }
- 
--  const benchmark::internal::BenchmarkInstance& b;
--  std::vector<BenchmarkReporter::Run>& complexity_reports;
-+  // And get rid of the manager.
-+  manager.reset();
- 
--  const double min_time;
--  const int repeats;
--  const bool has_explicit_iteration_count;
-+  // Adjust real/manual time stats since they were reported per thread.
-+  i.results.real_time_used /= b.threads();
-+  i.results.manual_time_used /= b.threads();
-+  // If we were measuring whole-process CPU usage, adjust the CPU time too.
-+  if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads();
- 
--  std::vector<std::thread> pool;
-+  BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
-+             << i.results.real_time_used << "\n";
- 
--  IterationCount iters;  // preserved between repetitions!
--  // So only the first repetition has to find/calculate it,
--  // the other repetitions will just use that precomputed iteration count.
-+  // By using KeepRunningBatch a benchmark can iterate more times than
-+  // requested, so take the iteration count from i.results.
-+  i.iters = i.results.iterations / b.threads();
- 
--  struct IterationResults {
--    internal::ThreadManager::Result results;
--    IterationCount iters;
--    double seconds;
--  };
--  IterationResults DoNIterations() {
--    VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n";
-+  // Base decisions off of real time if requested by this benchmark.
-+  i.seconds = i.results.cpu_time_used;
-+  if (b.use_manual_time()) {
-+    i.seconds = i.results.manual_time_used;
-+  } else if (b.use_real_time()) {
-+    i.seconds = i.results.real_time_used;
-+  }
- 
--    std::unique_ptr<internal::ThreadManager> manager;
--    manager.reset(new internal::ThreadManager(b.threads));
-+  return i;
-+}
- 
--    // Run all but one thread in separate threads
--    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
--      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
--                             manager.get());
--    }
--    // And run one thread here directly.
--    // (If we were asked to run just one thread, we don't create new threads.)
--    // Yes, we need to do this here *after* we start the separate threads.
--    RunInThread(&b, iters, 0, manager.get());
-+IterationCount BenchmarkRunner::PredictNumItersNeeded(
-+    const IterationResults& i) const {
-+  // See how much iterations should be increased by.
-+  // Note: Avoid division by zero with max(seconds, 1ns).
-+  double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
-+  // If our last run was at least 10% of FLAGS_benchmark_min_time then we
-+  // use the multiplier directly.
-+  // Otherwise we use at most 10 times expansion.
-+  // NOTE: When the last run was at least 10% of the min time the max
-+  // expansion should be 14x.
-+  bool is_significant = (i.seconds / min_time) > 0.1;
-+  multiplier = is_significant ? multiplier : 10.0;
-+
-+  // So what seems to be the sufficiently-large iteration count? Round up.
-+  const IterationCount max_next_iters = static_cast<IterationCount>(
-+      std::lround(std::max(multiplier * static_cast<double>(i.iters),
-+                           static_cast<double>(i.iters) + 1.0)));
-+  // But we do have *some* sanity limits though..
-+  const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
-+
-+  BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
-+  return next_iters;  // round up before conversion to integer.
-+}
- 
--    // The main thread has finished. Now let's wait for the other threads.
--    manager->WaitForAllThreads();
--    for (std::thread& thread : pool) thread.join();
-+bool BenchmarkRunner::ShouldReportIterationResults(
-+    const IterationResults& i) const {
-+  // Determine if this run should be reported;
-+  // Either it has run for a sufficient amount of time
-+  // or because an error was reported.
-+  return i.results.has_error_ ||
-+         i.iters >= kMaxIterations ||  // Too many iterations already.
-+         i.seconds >= min_time ||      // The elapsed time is large enough.
-+         // CPU time is specified but the elapsed real time greatly exceeds
-+         // the minimum time.
-+         // Note that user provided timers are except from this sanity check.
-+         ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
-+}
- 
--    IterationResults i;
--    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
--    {
--      MutexLock l(manager->GetBenchmarkMutex());
--      i.results = manager->results;
--    }
-+void BenchmarkRunner::DoOneRepetition() {
-+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
-+
-+  const bool is_the_first_repetition = num_repetitions_done == 0;
-+  IterationResults i;
-+
-+  // We *may* be gradually increasing the length (iteration count)
-+  // of the benchmark until we decide the results are significant.
-+  // And once we do, we report those last results and exit.
-+  // Please do note that the if there are repetitions, the iteration count
-+  // is *only* calculated for the *first* repetition, and other repetitions
-+  // simply use that precomputed iteration count.
-+  for (;;) {
-+    i = DoNIterations();
-+
-+    // Do we consider the results to be significant?
-+    // If we are doing repetitions, and the first repetition was already done,
-+    // it has calculated the correct iteration time, so we have run that very
-+    // iteration count just now. No need to calculate anything. Just report.
-+    // Else, the normal rules apply.
-+    const bool results_are_significant = !is_the_first_repetition ||
-+                                         has_explicit_iteration_count ||
-+                                         ShouldReportIterationResults(i);
-+
-+    if (results_are_significant) break;  // Good, let's report them!
-+
-+    // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
-+    // iteration count, and run the benchmark again...
-+
-+    iters = PredictNumItersNeeded(i);
-+    assert(iters > i.iters &&
-+           "if we did more iterations than we want to do the next time, "
-+           "then we should have accepted the current iteration run.");
-+  }
- 
--    // And get rid of the manager.
-+  // Oh, one last thing, we need to also produce the 'memory measurements'..
-+  MemoryManager::Result memory_result;
-+  IterationCount memory_iterations = 0;
-+  if (memory_manager != nullptr) {
-+    // Only run a few iterations to reduce the impact of one-time
-+    // allocations in benchmarks that are not properly managed.
-+    memory_iterations = std::min<IterationCount>(16, iters);
-+    memory_manager->Start();
-+    std::unique_ptr<internal::ThreadManager> manager;
-+    manager.reset(new internal::ThreadManager(1));
-+    RunInThread(&b, memory_iterations, 0, manager.get(),
-+                perf_counters_measurement_ptr);
-+    manager->WaitForAllThreads();
-     manager.reset();
- 
--    // Adjust real/manual time stats since they were reported per thread.
--    i.results.real_time_used /= b.threads;
--    i.results.manual_time_used /= b.threads;
--    // If we were measuring whole-process CPU usage, adjust the CPU time too.
--    if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
--
--    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
--            << i.results.real_time_used << "\n";
--
--    // So for how long were we running?
--    i.iters = iters;
--    // Base decisions off of real time if requested by this benchmark.
--    i.seconds = i.results.cpu_time_used;
--    if (b.use_manual_time) {
--      i.seconds = i.results.manual_time_used;
--    } else if (b.use_real_time) {
--      i.seconds = i.results.real_time_used;
--    }
--
--    return i;
-+    memory_manager->Stop(&memory_result);
-   }
- 
--  IterationCount PredictNumItersNeeded(const IterationResults& i) const {
--    // See how much iterations should be increased by.
--    // Note: Avoid division by zero with max(seconds, 1ns).
--    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
--    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
--    // use the multiplier directly.
--    // Otherwise we use at most 10 times expansion.
--    // NOTE: When the last run was at least 10% of the min time the max
--    // expansion should be 14x.
--    bool is_significant = (i.seconds / min_time) > 0.1;
--    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
--    if (multiplier <= 1.0) multiplier = 2.0;
--
--    // So what seems to be the sufficiently-large iteration count? Round up.
--    const IterationCount max_next_iters =
--        std::lround(std::max(multiplier * i.iters, i.iters + 1.0));
--    // But we do have *some* sanity limits though..
--    const IterationCount next_iters = std::min(max_next_iters, kMaxIterations);
--
--    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
--    return next_iters;  // round up before conversion to integer.
--  }
-+  // Ok, now actually report.
-+  BenchmarkReporter::Run report =
-+      CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
-+                      num_repetitions_done, repeats);
- 
--  bool ShouldReportIterationResults(const IterationResults& i) const {
--    // Determine if this run should be reported;
--    // Either it has run for a sufficient amount of time
--    // or because an error was reported.
--    return i.results.has_error_ ||
--           i.iters >= kMaxIterations ||  // Too many iterations already.
--           i.seconds >= min_time ||      // The elapsed time is large enough.
--           // CPU time is specified but the elapsed real time greatly exceeds
--           // the minimum time.
--           // Note that user provided timers are except from this sanity check.
--           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
-+  if (reports_for_family) {
-+    ++reports_for_family->num_runs_done;
-+    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
-   }
- 
--  void DoOneRepetition(int64_t repetition_index) {
--    const bool is_the_first_repetition = repetition_index == 0;
--    IterationResults i;
--
--    // We *may* be gradually increasing the length (iteration count)
--    // of the benchmark until we decide the results are significant.
--    // And once we do, we report those last results and exit.
--    // Please do note that the if there are repetitions, the iteration count
--    // is *only* calculated for the *first* repetition, and other repetitions
--    // simply use that precomputed iteration count.
--    for (;;) {
--      i = DoNIterations();
--
--      // Do we consider the results to be significant?
--      // If we are doing repetitions, and the first repetition was already done,
--      // it has calculated the correct iteration time, so we have run that very
--      // iteration count just now. No need to calculate anything. Just report.
--      // Else, the normal rules apply.
--      const bool results_are_significant = !is_the_first_repetition ||
--                                           has_explicit_iteration_count ||
--                                           ShouldReportIterationResults(i);
--
--      if (results_are_significant) break;  // Good, let's report them!
--
--      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
--      // iteration count, and run the benchmark again...
--
--      iters = PredictNumItersNeeded(i);
--      assert(iters > i.iters &&
--             "if we did more iterations than we want to do the next time, "
--             "then we should have accepted the current iteration run.");
--    }
--
--    // Oh, one last thing, we need to also produce the 'memory measurements'..
--    MemoryManager::Result memory_result;
--    IterationCount memory_iterations = 0;
--    if (memory_manager != nullptr) {
--      // Only run a few iterations to reduce the impact of one-time
--      // allocations in benchmarks that are not properly managed.
--      memory_iterations = std::min<IterationCount>(16, iters);
--      memory_manager->Start();
--      std::unique_ptr<internal::ThreadManager> manager;
--      manager.reset(new internal::ThreadManager(1));
--      RunInThread(&b, memory_iterations, 0, manager.get());
--      manager->WaitForAllThreads();
--      manager.reset();
--
--      memory_manager->Stop(&memory_result);
--    }
--
--    // Ok, now actualy report.
--    BenchmarkReporter::Run report =
--        CreateRunReport(b, i.results, memory_iterations, memory_result,
--                        i.seconds, repetition_index);
-+  run_results.non_aggregates.push_back(report);
- 
--    if (!report.error_occurred && b.complexity != oNone)
--      complexity_reports.push_back(report);
-+  ++num_repetitions_done;
-+}
- 
--    run_results.non_aggregates.push_back(report);
--  }
--};
-+RunResults&& BenchmarkRunner::GetResults() {
-+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
- 
--}  // end namespace
-+  // Calculate additional statistics over the repetitions of this instance.
-+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
- 
--RunResults RunBenchmark(
--    const benchmark::internal::BenchmarkInstance& b,
--    std::vector<BenchmarkReporter::Run>* complexity_reports) {
--  internal::BenchmarkRunner r(b, complexity_reports);
--  return r.get_results();
-+  return std::move(run_results);
- }
- 
- }  // end namespace internal
-diff --git a/lib/benchmark/src/benchmark_runner.h b/lib/benchmark/src/benchmark_runner.h
-index 96e8282a11..8427ce6a26 100644
---- a/lib/benchmark/src/benchmark_runner.h
-+++ b/lib/benchmark/src/benchmark_runner.h
-@@ -15,19 +15,22 @@
- #ifndef BENCHMARK_RUNNER_H_
- #define BENCHMARK_RUNNER_H_
- 
-+#include <thread>
-+#include <vector>
-+
- #include "benchmark_api_internal.h"
- #include "internal_macros.h"
--
--DECLARE_double(benchmark_min_time);
--
--DECLARE_int32(benchmark_repetitions);
--
--DECLARE_bool(benchmark_report_aggregates_only);
--
--DECLARE_bool(benchmark_display_aggregates_only);
-+#include "perf_counters.h"
-+#include "thread_manager.h"
- 
- namespace benchmark {
- 
-+BM_DECLARE_double(benchmark_min_time);
-+BM_DECLARE_int32(benchmark_repetitions);
-+BM_DECLARE_bool(benchmark_report_aggregates_only);
-+BM_DECLARE_bool(benchmark_display_aggregates_only);
-+BM_DECLARE_string(benchmark_perf_counters);
-+
- namespace internal {
- 
- extern MemoryManager* memory_manager;
-@@ -40,9 +43,57 @@ struct RunResults {
-   bool file_report_aggregates_only = false;
- };
- 
--RunResults RunBenchmark(
--    const benchmark::internal::BenchmarkInstance& b,
--    std::vector<BenchmarkReporter::Run>* complexity_reports);
-+class BenchmarkRunner {
-+ public:
-+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);
-+
-+  int GetNumRepeats() const { return repeats; }
-+
-+  bool HasRepeatsRemaining() const {
-+    return GetNumRepeats() != num_repetitions_done;
-+  }
-+
-+  void DoOneRepetition();
-+
-+  RunResults&& GetResults();
-+
-+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
-+    return reports_for_family;
-+  }
-+
-+ private:
-+  RunResults run_results;
-+
-+  const benchmark::internal::BenchmarkInstance& b;
-+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;
-+
-+  const double min_time;
-+  const int repeats;
-+  const bool has_explicit_iteration_count;
-+
-+  int num_repetitions_done = 0;
-+
-+  std::vector<std::thread> pool;
-+
-+  IterationCount iters;  // preserved between repetitions!
-+  // So only the first repetition has to find/calculate it,
-+  // the other repetitions will just use that precomputed iteration count.
-+
-+  PerfCountersMeasurement perf_counters_measurement;
-+  PerfCountersMeasurement* const perf_counters_measurement_ptr;
-+
-+  struct IterationResults {
-+    internal::ThreadManager::Result results;
-+    IterationCount iters;
-+    double seconds;
-+  };
-+  IterationResults DoNIterations();
-+
-+  IterationCount PredictNumItersNeeded(const IterationResults& i) const;
-+
-+  bool ShouldReportIterationResults(const IterationResults& i) const;
-+};
- 
- }  // namespace internal
- 
-diff --git a/lib/benchmark/src/check.h b/lib/benchmark/src/check.h
-index f5f8253f80..0efd13ff4d 100644
---- a/lib/benchmark/src/check.h
-+++ b/lib/benchmark/src/check.h
-@@ -23,8 +23,9 @@ BENCHMARK_NORETURN inline void CallAbortHandler() {
-   std::abort();  // fallback to enforce noreturn
- }
- 
--// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
--// will log information about the failures and abort when it is destructed.
-+// CheckHandler is the class constructed by failing BM_CHECK macros.
-+// CheckHandler will log information about the failures and abort when it is
-+// destructed.
- class CheckHandler {
-  public:
-   CheckHandler(const char* check, const char* file, const char* func, int line)
-@@ -51,32 +52,32 @@ class CheckHandler {
- }  // end namespace internal
- }  // end namespace benchmark
- 
--// The CHECK macro returns a std::ostream object that can have extra information
--// written to it.
-+// The BM_CHECK macro returns a std::ostream object that can have extra
-+// information written to it.
- #ifndef NDEBUG
--#define CHECK(b)                                                             \
-+#define BM_CHECK(b)                                                          \
-   (b ? ::benchmark::internal::GetNullLogInstance()                           \
-      : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
-            .GetLog())
- #else
--#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
-+#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance()
- #endif
- 
- // clang-format off
- // preserve whitespacing between operators for alignment
--#define CHECK_EQ(a, b) CHECK((a) == (b))
--#define CHECK_NE(a, b) CHECK((a) != (b))
--#define CHECK_GE(a, b) CHECK((a) >= (b))
--#define CHECK_LE(a, b) CHECK((a) <= (b))
--#define CHECK_GT(a, b) CHECK((a) > (b))
--#define CHECK_LT(a, b) CHECK((a) < (b))
--
--#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
--#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
--#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
--#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
--#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
--#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
-+#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b))
-+#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b))
-+#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b))
-+#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b))
-+#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b))
-+#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b))
-+
-+#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) <  (eps))
-+#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps))
-+#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps))
-+#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps))
-+#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) >  (eps))
-+#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) >  (eps))
- //clang-format on
- 
- #endif  // CHECK_H_
-diff --git a/lib/benchmark/src/colorprint.cc b/lib/benchmark/src/colorprint.cc
-index fff6a98818..afaa55dd54 100644
---- a/lib/benchmark/src/colorprint.cc
-+++ b/lib/benchmark/src/colorprint.cc
-@@ -94,7 +94,7 @@ std::string FormatString(const char* msg, va_list args) {
-   va_end(args_cp);
- 
-   // currently there is no error handling for failure, so this is hack.
--  CHECK(ret >= 0);
-+  BM_CHECK(ret >= 0);
- 
-   if (ret == 0)  // handle empty expansion
-     return {};
-@@ -105,7 +105,7 @@ std::string FormatString(const char* msg, va_list args) {
-     size = (size_t)ret + 1;  // + 1 for the null byte
-     std::unique_ptr<char[]> buff(new char[size]);
-     ret = vsnprintf(buff.get(), size, msg, args);
--    CHECK(ret > 0 && ((size_t)ret) < size);
-+    BM_CHECK(ret > 0 && ((size_t)ret) < size);
-     return buff.get();
-   }
- }
-diff --git a/lib/benchmark/src/commandlineflags.cc b/lib/benchmark/src/commandlineflags.cc
-index 3380a127a8..5724aaa294 100644
---- a/lib/benchmark/src/commandlineflags.cc
-+++ b/lib/benchmark/src/commandlineflags.cc
-@@ -20,6 +20,10 @@
- #include <cstring>
- #include <iostream>
- #include <limits>
-+#include <map>
-+#include <utility>
-+
-+#include "../src/string_util.h"
- 
- namespace benchmark {
- namespace {
-@@ -78,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) {
-   return true;
- }
- 
-+// Parses 'str' into KV pairs. If successful, writes the result to *value and
-+// returns true; otherwise leaves *value unchanged and returns false.
-+bool ParseKvPairs(const std::string& src_text, const char* str,
-+                  std::map<std::string, std::string>* value) {
-+  std::map<std::string, std::string> kvs;
-+  for (const auto& kvpair : StrSplit(str, ',')) {
-+    const auto kv = StrSplit(kvpair, '=');
-+    if (kv.size() != 2) {
-+      std::cerr << src_text << " is expected to be a comma-separated list of "
-+                << "<key>=<value> strings, but actually has value \"" << str
-+                << "\".\n";
-+      return false;
-+    }
-+    if (!kvs.emplace(kv[0], kv[1]).second) {
-+      std::cerr << src_text << " is expected to contain unique keys but key \""
-+                << kv[0] << "\" was repeated.\n";
-+      return false;
-+    }
-+  }
-+
-+  *value = kvs;
-+  return true;
-+}
-+
- // Returns the name of the environment variable corresponding to the
- // given flag.  For example, FlagToEnvVar("foo") will return
- // "BENCHMARK_FOO" in the open-source version.
-@@ -88,7 +116,7 @@ static std::string FlagToEnvVar(const char* flag) {
-   for (size_t i = 0; i != flag_str.length(); ++i)
-     env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
- 
--  return "BENCHMARK_" + env_var;
-+  return env_var;
- }
- 
- }  // namespace
-@@ -129,6 +157,20 @@ const char* StringFromEnv(const char* flag, const char* default_val) {
-   return value == nullptr ? default_val : value;
- }
- 
-+std::map<std::string, std::string> KvPairsFromEnv(
-+    const char* flag, std::map<std::string, std::string> default_val) {
-+  const std::string env_var = FlagToEnvVar(flag);
-+  const char* const value_str = getenv(env_var.c_str());
-+
-+  if (value_str == nullptr) return default_val;
-+
-+  std::map<std::string, std::string> value;
-+  if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) {
-+    return default_val;
-+  }
-+  return value;
-+}
-+
- // Parses a string as a command line flag.  The string should have
- // the format "--flag=value".  When def_optional is true, the "=value"
- // part can be omitted.
-@@ -206,6 +248,22 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
-   return true;
- }
- 
-+bool ParseKeyValueFlag(
-+    const char* str, const char* flag,
-+    std::map<std::string, std::string>* value) {
-+  const char* const value_str = ParseFlagValue(str, flag, false);
-+
-+  if (value_str == nullptr) return false;
-+
-+  for (const auto& kvpair : StrSplit(value_str, ',')) {
-+    const auto kv = StrSplit(kvpair, '=');
-+    if (kv.size() != 2) return false;
-+    value->emplace(kv[0], kv[1]);
-+  }
-+
-+  return true;
-+}
-+
- bool IsFlag(const char* str, const char* flag) {
-   return (ParseFlagValue(str, flag, true) != nullptr);
- }
-diff --git a/lib/benchmark/src/commandlineflags.h b/lib/benchmark/src/commandlineflags.h
-index 3a1f6a8dbc..5baaf11784 100644
---- a/lib/benchmark/src/commandlineflags.h
-+++ b/lib/benchmark/src/commandlineflags.h
-@@ -2,61 +2,70 @@
- #define BENCHMARK_COMMANDLINEFLAGS_H_
- 
- #include <cstdint>
-+#include <map>
- #include <string>
- 
- // Macro for referencing flags.
- #define FLAG(name) FLAGS_##name
- 
- // Macros for declaring flags.
--#define DECLARE_bool(name) extern bool FLAG(name)
--#define DECLARE_int32(name) extern int32_t FLAG(name)
--#define DECLARE_double(name) extern double FLAG(name)
--#define DECLARE_string(name) extern std::string FLAG(name)
-+#define BM_DECLARE_bool(name) extern bool FLAG(name)
-+#define BM_DECLARE_int32(name) extern int32_t FLAG(name)
-+#define BM_DECLARE_double(name) extern double FLAG(name)
-+#define BM_DECLARE_string(name) extern std::string FLAG(name)
-+#define BM_DECLARE_kvpairs(name) \
-+  extern std::map<std::string, std::string> FLAG(name)
- 
- // Macros for defining flags.
--#define DEFINE_bool(name, default_val)            \
--  bool FLAG(name) =                               \
--    benchmark::BoolFromEnv(#name, default_val)
--#define DEFINE_int32(name, default_val)           \
--  int32_t FLAG(name) =                            \
--    benchmark::Int32FromEnv(#name, default_val)
--#define DEFINE_double(name, default_val)          \
--  double FLAG(name) =                             \
--    benchmark::DoubleFromEnv(#name, default_val)
--#define DEFINE_string(name, default_val)          \
--  std::string FLAG(name) =                        \
--    benchmark::StringFromEnv(#name, default_val)
-+#define BM_DEFINE_bool(name, default_val) \
-+  bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val)
-+#define BM_DEFINE_int32(name, default_val) \
-+  int32_t FLAG(name) = benchmark::Int32FromEnv(#name, default_val)
-+#define BM_DEFINE_double(name, default_val) \
-+  double FLAG(name) = benchmark::DoubleFromEnv(#name, default_val)
-+#define BM_DEFINE_string(name, default_val) \
-+  std::string FLAG(name) = benchmark::StringFromEnv(#name, default_val)
-+#define BM_DEFINE_kvpairs(name, default_val)      \
-+  std::map<std::string, std::string> FLAG(name) = \
-+      benchmark::KvPairsFromEnv(#name, default_val)
- 
- namespace benchmark {
- 
--// Parses a bool from the environment variable
--// corresponding to the given flag.
-+// Parses a bool from the environment variable corresponding to the given flag.
- //
- // If the variable exists, returns IsTruthyFlagValue() value;  if not,
- // returns the given default value.
- bool BoolFromEnv(const char* flag, bool default_val);
- 
--// Parses an Int32 from the environment variable
--// corresponding to the given flag.
-+// Parses an Int32 from the environment variable corresponding to the given
-+// flag.
- //
- // If the variable exists, returns ParseInt32() value;  if not, returns
- // the given default value.
- int32_t Int32FromEnv(const char* flag, int32_t default_val);
- 
--// Parses an Double from the environment variable
--// corresponding to the given flag.
-+// Parses an Double from the environment variable corresponding to the given
-+// flag.
- //
- // If the variable exists, returns ParseDouble();  if not, returns
- // the given default value.
- double DoubleFromEnv(const char* flag, double default_val);
- 
--// Parses a string from the environment variable
--// corresponding to the given flag.
-+// Parses a string from the environment variable corresponding to the given
-+// flag.
- //
- // If variable exists, returns its value;  if not, returns
- // the given default value.
- const char* StringFromEnv(const char* flag, const char* default_val);
- 
-+// Parses a set of kvpairs from the environment variable corresponding to the
-+// given flag.
-+//
-+// If variable exists, returns its value;  if not, returns
-+// the given default value.
-+std::map<std::string, std::string> KvPairsFromEnv(
-+    const char* flag, std::map<std::string, std::string> default_val);
-+
- // Parses a string for a bool flag, in the form of either
- // "--flag=value" or "--flag".
- //
-@@ -68,27 +77,31 @@ const char* StringFromEnv(const char* flag, const char* default_val);
- // true.  On failure, returns false without changing *value.
- bool ParseBoolFlag(const char* str, const char* flag, bool* value);
- 
--// Parses a string for an Int32 flag, in the form of
--// "--flag=value".
-+// Parses a string for an Int32 flag, in the form of "--flag=value".
- //
- // On success, stores the value of the flag in *value, and returns
- // true.  On failure, returns false without changing *value.
- bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
- 
--// Parses a string for a Double flag, in the form of
--// "--flag=value".
-+// Parses a string for a Double flag, in the form of "--flag=value".
- //
- // On success, stores the value of the flag in *value, and returns
- // true.  On failure, returns false without changing *value.
- bool ParseDoubleFlag(const char* str, const char* flag, double* value);
- 
--// Parses a string for a string flag, in the form of
--// "--flag=value".
-+// Parses a string for a string flag, in the form of "--flag=value".
- //
- // On success, stores the value of the flag in *value, and returns
- // true.  On failure, returns false without changing *value.
- bool ParseStringFlag(const char* str, const char* flag, std::string* value);
- 
-+// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value"
-+//
-+// On success, stores the value of the flag in *value and returns true. On
-+// failure returns false, though *value may have been mutated.
-+bool ParseKeyValueFlag(const char* str, const char* flag,
-+                       std::map<std::string, std::string>* value);
-+
- // Returns true if the string matches the flag.
- bool IsFlag(const char* str, const char* flag);
- 
-diff --git a/lib/benchmark/src/complexity.cc b/lib/benchmark/src/complexity.cc
-index aeed67f0c7..b87697fb94 100644
---- a/lib/benchmark/src/complexity.cc
-+++ b/lib/benchmark/src/complexity.cc
-@@ -82,7 +82,6 @@ std::string GetBigOString(BigO complexity) {
- LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
-                        const std::vector<double>& time,
-                        BigOFunc* fitting_curve) {
--  double sigma_gn = 0.0;
-   double sigma_gn_squared = 0.0;
-   double sigma_time = 0.0;
-   double sigma_time_gn = 0.0;
-@@ -90,7 +89,6 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
-   // Calculate least square fitting parameter
-   for (size_t i = 0; i < n.size(); ++i) {
-     double gn_i = fitting_curve(n[i]);
--    sigma_gn += gn_i;
-     sigma_gn_squared += gn_i * gn_i;
-     sigma_time += time[i];
-     sigma_time_gn += time[i] * gn_i;
-@@ -125,10 +123,10 @@ LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
- //                  fitting curve.
- LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
-                        const std::vector<double>& time, const BigO complexity) {
--  CHECK_EQ(n.size(), time.size());
--  CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
--                          // benchmark runs are given
--  CHECK_NE(complexity, oNone);
-+  BM_CHECK_EQ(n.size(), time.size());
-+  BM_CHECK_GE(n.size(), 2);  // Do not compute fitting curve is less than two
-+                             // benchmark runs are given
-+  BM_CHECK_NE(complexity, oNone);
- 
-   LeastSq best_fit;
- 
-@@ -169,7 +167,8 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
- 
-   // Populate the accumulators.
-   for (const Run& run : reports) {
--    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
-+    BM_CHECK_GT(run.complexity_n, 0)
-+        << "Did you forget to call SetComplexityN?";
-     n.push_back(run.complexity_n);
-     real_time.push_back(run.real_accumulated_time / run.iterations);
-     cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
-@@ -193,11 +192,14 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
-   // Get the data from the accumulator to BenchmarkReporter::Run's.
-   Run big_o;
-   big_o.run_name = run_name;
-+  big_o.family_index = reports[0].family_index;
-+  big_o.per_family_instance_index = reports[0].per_family_instance_index;
-   big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
-   big_o.repetitions = reports[0].repetitions;
-   big_o.repetition_index = Run::no_repetition_index;
-   big_o.threads = reports[0].threads;
-   big_o.aggregate_name = "BigO";
-+  big_o.aggregate_unit = StatisticUnit::kTime;
-   big_o.report_label = reports[0].report_label;
-   big_o.iterations = 0;
-   big_o.real_accumulated_time = result_real.coef;
-@@ -215,8 +217,11 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
-   // Only add label to mean/stddev if it is same for all runs
-   Run rms;
-   rms.run_name = run_name;
-+  rms.family_index = reports[0].family_index;
-+  rms.per_family_instance_index = reports[0].per_family_instance_index;
-   rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
-   rms.aggregate_name = "RMS";
-+  rms.aggregate_unit = StatisticUnit::kPercentage;
-   rms.report_label = big_o.report_label;
-   rms.iterations = 0;
-   rms.repetition_index = Run::no_repetition_index;
-diff --git a/lib/benchmark/src/console_reporter.cc b/lib/benchmark/src/console_reporter.cc
-index 6fd764525e..61c34da822 100644
---- a/lib/benchmark/src/console_reporter.cc
-+++ b/lib/benchmark/src/console_reporter.cc
-@@ -142,10 +142,16 @@ void ConsoleReporter::PrintRunData(const Run& result) {
-   } else if (result.report_rms) {
-     printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
-             cpu_time * 100, "%");
--  } else {
-+  } else if (result.run_type != Run::RT_Aggregate ||
-+             result.aggregate_unit == StatisticUnit::kTime) {
-     const char* timeLabel = GetTimeUnitString(result.time_unit);
-     printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
-             cpu_time_str.c_str(), timeLabel);
-+  } else {
-+    assert(result.aggregate_unit == StatisticUnit::kPercentage);
-+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ",
-+            (100. * result.real_accumulated_time), "%",
-+            (100. * result.cpu_accumulated_time), "%");
-   }
- 
-   if (!result.report_big_o && !result.report_rms) {
-@@ -155,10 +161,17 @@ void ConsoleReporter::PrintRunData(const Run& result) {
-   for (auto& c : result.counters) {
-     const std::size_t cNameLen = std::max(std::string::size_type(10),
-                                           c.first.length());
--    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
-+    std::string s;
-     const char* unit = "";
--    if (c.second.flags & Counter::kIsRate)
--      unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
-+    if (result.run_type == Run::RT_Aggregate &&
-+        result.aggregate_unit == StatisticUnit::kPercentage) {
-+      s = StrFormat("%.2f", 100. * c.second.value);
-+      unit = "%";
-+    } else {
-+      s = HumanReadableNumber(c.second.value, c.second.oneK);
-+      if (c.second.flags & Counter::kIsRate)
-+        unit = (c.second.flags & Counter::kInvert) ? "s" : "/s";
-+    }
-     if (output_options_ & OO_Tabular) {
-       printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(),
-               unit);
-diff --git a/lib/benchmark/src/csv_reporter.cc b/lib/benchmark/src/csv_reporter.cc
-index af2c18fc8a..9bd7121daf 100644
---- a/lib/benchmark/src/csv_reporter.cc
-+++ b/lib/benchmark/src/csv_reporter.cc
-@@ -85,7 +85,8 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
-       for (const auto& cnt : run.counters) {
-         if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
-           continue;
--        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
-+        BM_CHECK(user_counter_names_.find(cnt.first) !=
-+                 user_counter_names_.end())
-             << "All counters must be present in each run. "
-             << "Counter named \"" << cnt.first
-             << "\" was not in a run after being added to the header";
-diff --git a/lib/benchmark/src/cycleclock.h b/lib/benchmark/src/cycleclock.h
-index d5d62c4c7f..f22ca9f7d2 100644
---- a/lib/benchmark/src/cycleclock.h
-+++ b/lib/benchmark/src/cycleclock.h
-@@ -36,7 +36,7 @@
- // declarations of some other intrinsics, breaking compilation.
- // Therefore, we simply declare __rdtsc ourselves. See also
- // http://connect.microsoft.com/VisualStudio/feedback/details/262047
--#if defined(COMPILER_MSVC) && !defined(_M_IX86)
-+#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64)
- extern "C" uint64_t __rdtsc();
- #pragma intrinsic(__rdtsc)
- #endif
-@@ -84,13 +84,21 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
-   return (high << 32) | low;
- #elif defined(__powerpc__) || defined(__ppc__)
-   // This returns a time-base, which is not always precisely a cycle-count.
--  int64_t tbl, tbu0, tbu1;
--  asm("mftbu %0" : "=r"(tbu0));
--  asm("mftb  %0" : "=r"(tbl));
--  asm("mftbu %0" : "=r"(tbu1));
--  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
--  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is garbage)
--  return (tbu1 << 32) | tbl;
-+#if defined(__powerpc64__) || defined(__ppc64__)
-+  int64_t tb;
-+  asm volatile("mfspr %0, 268" : "=r"(tb));
-+  return tb;
-+#else
-+  uint32_t tbl, tbu0, tbu1;
-+  asm volatile(
-+      "mftbu %0\n"
-+      "mftb %1\n"
-+      "mftbu %2"
-+      : "=r"(tbu0), "=r"(tbl), "=r"(tbu1));
-+  tbl &= -static_cast<int32_t>(tbu0 == tbu1);
-+  // high 32 bits in tbu1; low 32 bits in tbl  (tbu0 is no longer needed)
-+  return (static_cast<uint64_t>(tbu1) << 32) | tbl;
-+#endif
- #elif defined(__sparc__)
-   int64_t tick;
-   asm(".byte 0x83, 0x41, 0x00, 0x00");
-@@ -106,6 +114,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
-   // when I know it will work.  Otherwise, I'll use __rdtsc and hope
-   // the code is being compiled with a non-ancient compiler.
-   _asm rdtsc
-+#elif defined(COMPILER_MSVC) && defined(_M_ARM64)
-+  // See https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics?view=vs-2019
-+  // and https://reviews.llvm.org/D53115
-+  int64_t virtual_timer_value;
-+  virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT);
-+  return virtual_timer_value;
- #elif defined(COMPILER_MSVC)
-   return __rdtsc();
- #elif defined(BENCHMARK_OS_NACL)
-@@ -153,32 +167,51 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
-   struct timeval tv;
-   gettimeofday(&tv, nullptr);
-   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
--#elif defined(__mips__)
-+#elif defined(__mips__) || defined(__m68k__)
-   // mips apparently only allows rdtsc for superusers, so we fall
-   // back to gettimeofday.  It's possible clock_gettime would be better.
-   struct timeval tv;
-   gettimeofday(&tv, nullptr);
-   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-+#elif defined(__loongarch__)
-+  struct timeval tv;
-+  gettimeofday(&tv, nullptr);
-+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
- #elif defined(__s390__)  // Covers both s390 and s390x.
-   // Return the CPU clock.
-   uint64_t tsc;
-+#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL)
-+  // z/OS XL compiler HLASM syntax.
-+  asm(" stck %0" : "=m"(tsc) : : "cc");
-+#else
-   asm("stck %0" : "=Q"(tsc) : : "cc");
-+#endif
-   return tsc;
- #elif defined(__riscv) // RISC-V
-   // Use RDCYCLE (and RDCYCLEH on riscv32)
- #if __riscv_xlen == 32
--  uint64_t cycles_low, cycles_hi0, cycles_hi1;
--  asm("rdcycleh %0" : "=r"(cycles_hi0));
--  asm("rdcycle %0" : "=r"(cycles_lo));
--  asm("rdcycleh %0" : "=r"(cycles_hi1));
--  // This matches the PowerPC overflow detection, above
--  cycles_lo &= -static_cast<int64_t>(cycles_hi0 == cycles_hi1);
--  return (cycles_hi1 << 32) | cycles_lo;
-+  uint32_t cycles_lo, cycles_hi0, cycles_hi1;
-+  // This asm also includes the PowerPC overflow handling strategy, as above.
-+  // Implemented in assembly because Clang insisted on branching.
-+  asm volatile(
-+      "rdcycleh %0\n"
-+      "rdcycle %1\n"
-+      "rdcycleh %2\n"
-+      "sub %0, %0, %2\n"
-+      "seqz %0, %0\n"
-+      "sub %0, zero, %0\n"
-+      "and %1, %1, %0\n"
-+      : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1));
-+  return (static_cast<uint64_t>(cycles_hi1) << 32) | cycles_lo;
- #else
-   uint64_t cycles;
--  asm("rdcycle %0" : "=r"(cycles));
-+  asm volatile("rdcycle %0" : "=r"(cycles));
-   return cycles;
- #endif
-+#elif defined(__e2k__) || defined(__elbrus__)
-+  struct timeval tv;
-+  gettimeofday(&tv, nullptr);
-+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
- #else
- // The soft failover to a generic implementation is automatic only for ARM.
- // For other platforms the developer is expected to make an attempt to create
-diff --git a/lib/benchmark/src/internal_macros.h b/lib/benchmark/src/internal_macros.h
-index 6adf00d056..91f367b894 100644
---- a/lib/benchmark/src/internal_macros.h
-+++ b/lib/benchmark/src/internal_macros.h
-@@ -13,7 +13,11 @@
- #endif
- 
- #if defined(__clang__)
--  #if !defined(COMPILER_CLANG)
-+  #if defined(__ibmxl__)
-+    #if !defined(COMPILER_IBMXL)
-+      #define COMPILER_IBMXL
-+    #endif
-+  #elif !defined(COMPILER_CLANG)
-     #define COMPILER_CLANG
-   #endif
- #elif defined(_MSC_VER)
-@@ -58,6 +62,8 @@
-   #define BENCHMARK_OS_NETBSD 1
- #elif defined(__OpenBSD__)
-   #define BENCHMARK_OS_OPENBSD 1
-+#elif defined(__DragonFly__)
-+  #define BENCHMARK_OS_DRAGONFLY 1
- #elif defined(__linux__)
-   #define BENCHMARK_OS_LINUX 1
- #elif defined(__native_client__)
-@@ -72,6 +78,8 @@
- #define BENCHMARK_OS_SOLARIS 1
- #elif defined(__QNX__)
- #define BENCHMARK_OS_QNX 1
-+#elif defined(__MVS__)
-+#define BENCHMARK_OS_ZOS 1
- #endif
- 
- #if defined(__ANDROID__) && defined(__GLIBCXX__)
-diff --git a/lib/benchmark/src/json_reporter.cc b/lib/benchmark/src/json_reporter.cc
-index e5f3c35248..22d5ce021c 100644
---- a/lib/benchmark/src/json_reporter.cc
-+++ b/lib/benchmark/src/json_reporter.cc
-@@ -12,9 +12,6 @@
- // See the License for the specific language governing permissions and
- // limitations under the License.
- 
--#include "benchmark/benchmark.h"
--#include "complexity.h"
--
- #include <algorithm>
- #include <cmath>
- #include <cstdint>
-@@ -25,41 +22,65 @@
- #include <tuple>
- #include <vector>
- 
-+#include "benchmark/benchmark.h"
-+#include "complexity.h"
- #include "string_util.h"
- #include "timers.h"
- 
- namespace benchmark {
-+namespace internal {
-+extern std::map<std::string, std::string>* global_context;
-+}
- 
- namespace {
- 
--std::string StrEscape(const std::string & s) {
-+std::string StrEscape(const std::string& s) {
-   std::string tmp;
-   tmp.reserve(s.size());
-   for (char c : s) {
-     switch (c) {
--    case '\b': tmp += "\\b"; break;
--    case '\f': tmp += "\\f"; break;
--    case '\n': tmp += "\\n"; break;
--    case '\r': tmp += "\\r"; break;
--    case '\t': tmp += "\\t"; break;
--    case '\\': tmp += "\\\\"; break;
--    case '"' : tmp += "\\\""; break;
--    default  : tmp += c; break;
-+      case '\b':
-+        tmp += "\\b";
-+        break;
-+      case '\f':
-+        tmp += "\\f";
-+        break;
-+      case '\n':
-+        tmp += "\\n";
-+        break;
-+      case '\r':
-+        tmp += "\\r";
-+        break;
-+      case '\t':
-+        tmp += "\\t";
-+        break;
-+      case '\\':
-+        tmp += "\\\\";
-+        break;
-+      case '"':
-+        tmp += "\\\"";
-+        break;
-+      default:
-+        tmp += c;
-+        break;
-     }
-   }
-   return tmp;
- }
- 
- std::string FormatKV(std::string const& key, std::string const& value) {
--  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
-+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
-+                   StrEscape(value).c_str());
- }
- 
- std::string FormatKV(std::string const& key, const char* value) {
--  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
-+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(),
-+                   StrEscape(value).c_str());
- }
- 
- std::string FormatKV(std::string const& key, bool value) {
--  return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
-+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(),
-+                   value ? "true" : "false");
- }
- 
- std::string FormatKV(std::string const& key, int64_t value) {
-@@ -122,8 +143,12 @@ bool JSONReporter::ReportContext(const Context& context) {
-       << FormatKV("mhz_per_cpu",
-                   RoundDouble(info.cycles_per_second / 1000000.0))
-       << ",\n";
--  out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled)
--      << ",\n";
-+  if (CPUInfo::Scaling::UNKNOWN != info.scaling) {
-+    out << indent
-+        << FormatKV("cpu_scaling_enabled",
-+                    info.scaling == CPUInfo::Scaling::ENABLED ? true : false)
-+        << ",\n";
-+  }
- 
-   out << indent << "\"caches\": [\n";
-   indent = std::string(6, ' ');
-@@ -134,8 +159,8 @@ bool JSONReporter::ReportContext(const Context& context) {
-     out << cache_indent << FormatKV("type", CI.type) << ",\n";
-     out << cache_indent << FormatKV("level", static_cast<int64_t>(CI.level))
-         << ",\n";
--    out << cache_indent
--        << FormatKV("size", static_cast<int64_t>(CI.size)) << ",\n";
-+    out << cache_indent << FormatKV("size", static_cast<int64_t>(CI.size))
-+        << ",\n";
-     out << cache_indent
-         << FormatKV("num_sharing", static_cast<int64_t>(CI.num_sharing))
-         << "\n";
-@@ -157,7 +182,16 @@ bool JSONReporter::ReportContext(const Context& context) {
- #else
-   const char build_type[] = "debug";
- #endif
--  out << indent << FormatKV("library_build_type", build_type) << "\n";
-+  out << indent << FormatKV("library_build_type", build_type);
-+
-+  if (internal::global_context != nullptr) {
-+    for (const auto& kv : *internal::global_context) {
-+      out << ",\n";
-+      out << indent << FormatKV(kv.first, kv.second);
-+    }
-+  }
-+  out << "\n";
-+
-   // Close context block and open the list of benchmarks.
-   out << inner_indent << "},\n";
-   out << inner_indent << "\"benchmarks\": [\n";
-@@ -195,6 +229,10 @@ void JSONReporter::PrintRunData(Run const& run) {
-   std::string indent(6, ' ');
-   std::ostream& out = GetOutputStream();
-   out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
-+  out << indent << FormatKV("family_index", run.family_index) << ",\n";
-+  out << indent
-+      << FormatKV("per_family_instance_index", run.per_family_instance_index)
-+      << ",\n";
-   out << indent << FormatKV("run_name", run.run_name.str()) << ",\n";
-   out << indent << FormatKV("run_type", [&run]() -> const char* {
-     switch (run.run_type) {
-@@ -213,6 +251,15 @@ void JSONReporter::PrintRunData(Run const& run) {
-   out << indent << FormatKV("threads", run.threads) << ",\n";
-   if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
-     out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
-+    out << indent << FormatKV("aggregate_unit", [&run]() -> const char* {
-+      switch (run.aggregate_unit) {
-+        case StatisticUnit::kTime:
-+          return "time";
-+        case StatisticUnit::kPercentage:
-+          return "percentage";
-+      }
-+      BENCHMARK_UNREACHABLE();
-+    }()) << ",\n";
-   }
-   if (run.error_occurred) {
-     out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
-@@ -220,8 +267,17 @@ void JSONReporter::PrintRunData(Run const& run) {
-   }
-   if (!run.report_big_o && !run.report_rms) {
-     out << indent << FormatKV("iterations", run.iterations) << ",\n";
--    out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n";
--    out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
-+    if (run.run_type != Run::RT_Aggregate ||
-+        run.aggregate_unit == StatisticUnit::kTime) {
-+      out << indent << FormatKV("real_time", run.GetAdjustedRealTime())
-+          << ",\n";
-+      out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime());
-+    } else {
-+      assert(run.aggregate_unit == StatisticUnit::kPercentage);
-+      out << indent << FormatKV("real_time", run.real_accumulated_time)
-+          << ",\n";
-+      out << indent << FormatKV("cpu_time", run.cpu_accumulated_time);
-+    }
-     out << ",\n"
-         << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit));
-   } else if (run.report_big_o) {
-diff --git a/lib/benchmark/src/log.h b/lib/benchmark/src/log.h
-index 47d0c35c01..48c071aded 100644
---- a/lib/benchmark/src/log.h
-+++ b/lib/benchmark/src/log.h
-@@ -67,7 +67,7 @@ inline LogType& GetLogInstanceForLevel(int level) {
- }  // end namespace benchmark
- 
- // clang-format off
--#define VLOG(x)                                                               \
-+#define BM_VLOG(x)                                                               \
-   (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \
-                                                                          " ")
- // clang-format on
-diff --git a/lib/benchmark/src/mutex.h b/lib/benchmark/src/mutex.h
-index 3fac79aea4..bec78d9e5f 100644
---- a/lib/benchmark/src/mutex.h
-+++ b/lib/benchmark/src/mutex.h
-@@ -9,60 +9,60 @@
- // Enable thread safety attributes only with clang.
- // The attributes can be safely erased when compiling with other compilers.
- #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
--#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x))
-+#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x))
- #else
--#define THREAD_ANNOTATION_ATTRIBUTE__(x)  // no-op
-+#define THREAD_ANNOTATION_ATTRIBUTE_(x)  // no-op
- #endif
- 
--#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
-+#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x))
- 
--#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
-+#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable)
- 
--#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
-+#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x))
- 
--#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
-+#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x))
- 
- #define ACQUIRED_BEFORE(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__))
- 
- #define ACQUIRED_AFTER(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__))
- 
- #define REQUIRES(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__))
- 
- #define REQUIRES_SHARED(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__))
- 
- #define ACQUIRE(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__))
- 
- #define ACQUIRE_SHARED(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__))
- 
- #define RELEASE(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__))
- 
- #define RELEASE_SHARED(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__))
- 
- #define TRY_ACQUIRE(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__))
- 
- #define TRY_ACQUIRE_SHARED(...) \
--  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
-+  THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__))
- 
--#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
-+#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__))
- 
--#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
-+#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x))
- 
- #define ASSERT_SHARED_CAPABILITY(x) \
--  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
-+  THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x))
- 
--#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
-+#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x))
- 
- #define NO_THREAD_SAFETY_ANALYSIS \
--  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
-+  THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis)
- 
- namespace benchmark {
- 
-@@ -130,7 +130,7 @@ class Barrier {
-   // entered the barrier.  Returns iff this is the last thread to
-   // enter the barrier.
-   bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
--    CHECK_LT(entered_, running_threads_);
-+    BM_CHECK_LT(entered_, running_threads_);
-     entered_++;
-     if (entered_ < running_threads_) {
-       // Wait for all threads to enter
-diff --git a/lib/benchmark/src/perf_counters.cc b/lib/benchmark/src/perf_counters.cc
-new file mode 100644
-index 0000000000..4ddf0de250
---- /dev/null
-+++ b/lib/benchmark/src/perf_counters.cc
-@@ -0,0 +1,132 @@
-+// Copyright 2021 Google Inc. All rights reserved.
-+//
-+// Licensed under the Apache License, Version 2.0 (the "License");
-+// you may not use this file except in compliance with the License.
-+// You may obtain a copy of the License at
-+//
-+//     http://www.apache.org/licenses/LICENSE-2.0
-+//
-+// Unless required by applicable law or agreed to in writing, software
-+// distributed under the License is distributed on an "AS IS" BASIS,
-+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+// See the License for the specific language governing permissions and
-+// limitations under the License.
-+
-+#include "perf_counters.h"
-+
-+#include <cstring>
-+#include <vector>
-+
-+#if defined HAVE_LIBPFM
-+#include "perfmon/pfmlib.h"
-+#include "perfmon/pfmlib_perf_event.h"
-+#endif
-+
-+namespace benchmark {
-+namespace internal {
-+
-+constexpr size_t PerfCounterValues::kMaxCounters;
-+
-+#if defined HAVE_LIBPFM
-+const bool PerfCounters::kSupported = true;
-+
-+bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; }
-+
-+PerfCounters PerfCounters::Create(
-+    const std::vector<std::string>& counter_names) {
-+  if (counter_names.empty()) {
-+    return NoCounters();
-+  }
-+  if (counter_names.size() > PerfCounterValues::kMaxCounters) {
-+    GetErrorLogInstance()
-+        << counter_names.size()
-+        << " counters were requested. The minimum is 1, the maximum is "
-+        << PerfCounterValues::kMaxCounters << "\n";
-+    return NoCounters();
-+  }
-+  std::vector<int> counter_ids(counter_names.size());
-+
-+  const int mode = PFM_PLM3;  // user mode only
-+  for (size_t i = 0; i < counter_names.size(); ++i) {
-+    const bool is_first = i == 0;
-+    struct perf_event_attr attr{};
-+    attr.size = sizeof(attr);
-+    const int group_id = !is_first ? counter_ids[0] : -1;
-+    const auto& name = counter_names[i];
-+    if (name.empty()) {
-+      GetErrorLogInstance() << "A counter name was the empty string\n";
-+      return NoCounters();
-+    }
-+    pfm_perf_encode_arg_t arg{};
-+    arg.attr = &attr;
-+
-+    const int pfm_get =
-+        pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
-+    if (pfm_get != PFM_SUCCESS) {
-+      GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
-+      return NoCounters();
-+    }
-+    attr.disabled = is_first;
-+    // Note: the man page for perf_event_create suggests inerit = true and
-+    // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
-+    // case.
-+    attr.inherit = true;
-+    attr.pinned = is_first;
-+    attr.exclude_kernel = true;
-+    attr.exclude_user = false;
-+    attr.exclude_hv = true;
-+    // Read all counters in one read.
-+    attr.read_format = PERF_FORMAT_GROUP;
-+
-+    int id = -1;
-+    static constexpr size_t kNrOfSyscallRetries = 5;
-+    // Retry syscall as it was interrupted often (b/64774091).
-+    for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries;
-+         ++num_retries) {
-+      id = perf_event_open(&attr, 0, -1, group_id, 0);
-+      if (id >= 0 || errno != EINTR) {
-+        break;
-+      }
-+    }
-+    if (id < 0) {
-+      GetErrorLogInstance()
-+          << "Failed to get a file descriptor for " << name << "\n";
-+      return NoCounters();
-+    }
-+
-+    counter_ids[i] = id;
-+  }
-+  if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) {
-+    GetErrorLogInstance() << "Failed to start counters\n";
-+    return NoCounters();
-+  }
-+
-+  return PerfCounters(counter_names, std::move(counter_ids));
-+}
-+
-+PerfCounters::~PerfCounters() {
-+  if (counter_ids_.empty()) {
-+    return;
-+  }
-+  ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE);
-+  for (int fd : counter_ids_) {
-+    close(fd);
-+  }
-+}
-+#else   // defined HAVE_LIBPFM
-+const bool PerfCounters::kSupported = false;
-+
-+bool PerfCounters::Initialize() { return false; }
-+
-+PerfCounters PerfCounters::Create(
-+    const std::vector<std::string>& counter_names) {
-+  if (!counter_names.empty()) {
-+    GetErrorLogInstance() << "Performance counters not supported.";
-+  }
-+  return NoCounters();
-+}
-+
-+PerfCounters::~PerfCounters() = default;
-+#endif  // defined HAVE_LIBPFM
-+}  // namespace internal
-+}  // namespace benchmark
-diff --git a/lib/benchmark/src/perf_counters.h b/lib/benchmark/src/perf_counters.h
-new file mode 100644
-index 0000000000..47ca1385e2
---- /dev/null
-+++ b/lib/benchmark/src/perf_counters.h
-@@ -0,0 +1,172 @@
-+// Copyright 2021 Google Inc. All rights reserved.
-+//
-+// Licensed under the Apache License, Version 2.0 (the "License");
-+// you may not use this file except in compliance with the License.
-+// You may obtain a copy of the License at
-+//
-+//     http://www.apache.org/licenses/LICENSE-2.0
-+//
-+// Unless required by applicable law or agreed to in writing, software
-+// distributed under the License is distributed on an "AS IS" BASIS,
-+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+// See the License for the specific language governing permissions and
-+// limitations under the License.
-+
-+#ifndef BENCHMARK_PERF_COUNTERS_H
-+#define BENCHMARK_PERF_COUNTERS_H
-+
-+#include <array>
-+#include <cstdint>
-+#include <vector>
-+
-+#include "benchmark/benchmark.h"
-+#include "check.h"
-+#include "log.h"
-+
-+#ifndef BENCHMARK_OS_WINDOWS
-+#include <unistd.h>
-+#endif
-+
-+namespace benchmark {
-+namespace internal {
-+
-+// Typically, we can only read a small number of counters. There is also a
-+// padding preceding counter values, when reading multiple counters with one
-+// syscall (which is desirable). PerfCounterValues abstracts these details.
-+// The implementation ensures the storage is inlined, and allows 0-based
-+// indexing into the counter values.
-+// The object is used in conjunction with a PerfCounters object, by passing it
-+// to Snapshot(). The values are populated such that
-+// perfCounters->names()[i]'s value is obtained at position i (as given by
-+// operator[]) of this object.
-+class PerfCounterValues {
-+ public:
-+  explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) {
-+    BM_CHECK_LE(nr_counters_, kMaxCounters);
-+  }
-+
-+  uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; }
-+
-+  static constexpr size_t kMaxCounters = 3;
-+
-+ private:
-+  friend class PerfCounters;
-+  // Get the byte buffer in which perf counters can be captured.
-+  // This is used by PerfCounters::Read
-+  std::pair<char*, size_t> get_data_buffer() {
-+    return {reinterpret_cast<char*>(values_.data()),
-+            sizeof(uint64_t) * (kPadding + nr_counters_)};
-+  }
-+
-+  static constexpr size_t kPadding = 1;
-+  std::array<uint64_t, kPadding + kMaxCounters> values_;
-+  const size_t nr_counters_;
-+};
-+
-+// Collect PMU counters. The object, once constructed, is ready to be used by
-+// calling read(). PMU counter collection is enabled from the time create() is
-+// called, to obtain the object, until the object's destructor is called.
-+class PerfCounters final {
-+ public:
-+  // True iff this platform supports performance counters.
-+  static const bool kSupported;
-+
-+  bool IsValid() const { return is_valid_; }
-+  static PerfCounters NoCounters() { return PerfCounters(); }
-+
-+  ~PerfCounters();
-+  PerfCounters(PerfCounters&&) = default;
-+  PerfCounters(const PerfCounters&) = delete;
-+
-+  // Platform-specific implementations may choose to do some library
-+  // initialization here.
-+  static bool Initialize();
-+
-+  // Return a PerfCounters object ready to read the counters with the names
-+  // specified. The values are user-mode only. The counter name format is
-+  // implementation and OS specific.
-+  // TODO: once we move to C++-17, this should be a std::optional, and then the
-+  // IsValid() boolean can be dropped.
-+  static PerfCounters Create(const std::vector<std::string>& counter_names);
-+
-+  // Take a snapshot of the current value of the counters into the provided
-+  // valid PerfCounterValues storage. The values are populated such that:
-+  // names()[i]'s value is (*values)[i]
-+  BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
-+#ifndef BENCHMARK_OS_WINDOWS
-+    assert(values != nullptr);
-+    assert(IsValid());
-+    auto buffer = values->get_data_buffer();
-+    auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second);
-+    return static_cast<size_t>(read_bytes) == buffer.second;
-+#else
-+    (void)values;
-+    return false;
-+#endif
-+  }
-+
-+  const std::vector<std::string>& names() const { return counter_names_; }
-+  size_t num_counters() const { return counter_names_.size(); }
-+
-+ private:
-+  PerfCounters(const std::vector<std::string>& counter_names,
-+               std::vector<int>&& counter_ids)
-+      : counter_ids_(std::move(counter_ids)),
-+        counter_names_(counter_names),
-+        is_valid_(true) {}
-+  PerfCounters() : is_valid_(false) {}
-+
-+  std::vector<int> counter_ids_;
-+  const std::vector<std::string> counter_names_;
-+  const bool is_valid_;
-+};
-+
-+// Typical usage of the above primitives.
-+class PerfCountersMeasurement final {
-+ public:
-+  PerfCountersMeasurement(PerfCounters&& c)
-+      : counters_(std::move(c)),
-+        start_values_(counters_.IsValid() ? counters_.names().size() : 0),
-+        end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
-+
-+  bool IsValid() const { return counters_.IsValid(); }
-+
-+  BENCHMARK_ALWAYS_INLINE void Start() {
-+    assert(IsValid());
-+    // Tell the compiler to not move instructions above/below where we take
-+    // the snapshot.
-+    ClobberMemory();
-+    counters_.Snapshot(&start_values_);
-+    ClobberMemory();
-+  }
-+
-+  BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
-+  StopAndGetMeasurements() {
-+    assert(IsValid());
-+    // Tell the compiler to not move instructions above/below where we take
-+    // the snapshot.
-+    ClobberMemory();
-+    counters_.Snapshot(&end_values_);
-+    ClobberMemory();
-+
-+    std::vector<std::pair<std::string, double>> ret;
-+    for (size_t i = 0; i < counters_.names().size(); ++i) {
-+      double measurement = static_cast<double>(end_values_[i]) -
-+                           static_cast<double>(start_values_[i]);
-+      ret.push_back({counters_.names()[i], measurement});
-+    }
-+    return ret;
-+  }
-+
-+ private:
-+  PerfCounters counters_;
-+  PerfCounterValues start_values_;
-+  PerfCounterValues end_values_;
-+};
-+
-+BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
-+
-+}  // namespace internal
-+}  // namespace benchmark
-+
-+#endif  // BENCHMARK_PERF_COUNTERS_H
-diff --git a/lib/benchmark/src/re.h b/lib/benchmark/src/re.h
-index fbe25037b4..630046782d 100644
---- a/lib/benchmark/src/re.h
-+++ b/lib/benchmark/src/re.h
-@@ -126,7 +126,7 @@ inline bool Regex::Init(const std::string& spec, std::string* error) {
- 
-       // regerror returns the number of bytes necessary to null terminate
-       // the string, so we move that when assigning to error.
--      CHECK_NE(needed, 0);
-+      BM_CHECK_NE(needed, 0);
-       error->assign(errbuf, needed - 1);
- 
-       delete[] errbuf;
-diff --git a/lib/benchmark/src/reporter.cc b/lib/benchmark/src/reporter.cc
-index 0b54fa421a..c720a9df1d 100644
---- a/lib/benchmark/src/reporter.cc
-+++ b/lib/benchmark/src/reporter.cc
-@@ -18,6 +18,8 @@
- #include <cstdlib>
- 
- #include <iostream>
-+#include <map>
-+#include <string>
- #include <tuple>
- #include <vector>
- 
-@@ -25,6 +27,9 @@
- #include "string_util.h"
- 
- namespace benchmark {
-+namespace internal {
-+extern std::map<std::string, std::string>* global_context;
-+}
- 
- BenchmarkReporter::BenchmarkReporter()
-     : output_stream_(&std::cout), error_stream_(&std::cerr) {}
-@@ -33,7 +38,7 @@ BenchmarkReporter::~BenchmarkReporter() {}
- 
- void BenchmarkReporter::PrintBasicContext(std::ostream *out,
-                                           Context const &context) {
--  CHECK(out) << "cannot be null";
-+  BM_CHECK(out) << "cannot be null";
-   auto &Out = *out;
- 
-   Out << LocalDateTimeString() << "\n";
-@@ -64,7 +69,13 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out,
-     Out << "\n";
-   }
- 
--  if (info.scaling_enabled) {
-+  if (internal::global_context != nullptr) {
-+    for (const auto& kv: *internal::global_context) {
-+      Out << kv.first << ": " << kv.second << "\n";
-+    }
-+  }
-+
-+  if (CPUInfo::Scaling::ENABLED == info.scaling) {
-     Out << "***WARNING*** CPU scaling is enabled, the benchmark "
-            "real time measurements may be noisy and will incur extra "
-            "overhead.\n";
-diff --git a/lib/benchmark/src/sleep.cc b/lib/benchmark/src/sleep.cc
-index 1512ac90f7..4609d540ea 100644
---- a/lib/benchmark/src/sleep.cc
-+++ b/lib/benchmark/src/sleep.cc
-@@ -24,6 +24,10 @@
- #include <windows.h>
- #endif
- 
-+#ifdef BENCHMARK_OS_ZOS
-+#include <unistd.h>
-+#endif
-+
- namespace benchmark {
- #ifdef BENCHMARK_OS_WINDOWS
- // Window's Sleep takes milliseconds argument.
-@@ -33,11 +37,23 @@ void SleepForSeconds(double seconds) {
- }
- #else   // BENCHMARK_OS_WINDOWS
- void SleepForMicroseconds(int microseconds) {
-+#ifdef BENCHMARK_OS_ZOS
-+  // z/OS does not support nanosleep. Instead call sleep() and then usleep() to
-+  // sleep for the remaining microseconds because usleep() will fail if its
-+  // argument is greater than 1000000.
-+  div_t sleepTime = div(microseconds, kNumMicrosPerSecond);
-+  int seconds = sleepTime.quot;
-+  while (seconds != 0)
-+    seconds = sleep(seconds);
-+  while (usleep(sleepTime.rem) == -1 && errno == EINTR)
-+    ;
-+#else
-   struct timespec sleep_time;
-   sleep_time.tv_sec = microseconds / kNumMicrosPerSecond;
-   sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro;
-   while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR)
-     ;  // Ignore signals and wait for the full interval to elapse.
-+#endif
- }
- 
- void SleepForMilliseconds(int milliseconds) {
-diff --git a/lib/benchmark/src/statistics.cc b/lib/benchmark/src/statistics.cc
-index bd5a3d6597..00ae97dfa8 100644
---- a/lib/benchmark/src/statistics.cc
-+++ b/lib/benchmark/src/statistics.cc
-@@ -74,6 +74,15 @@ double StatisticsStdDev(const std::vector<double>& v) {
-   return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
- }
- 
-+double StatisticsCV(const std::vector<double>& v) {
-+  if (v.size() < 2) return 0.0;
-+
-+  const auto stddev = StatisticsStdDev(v);
-+  const auto mean = StatisticsMean(v);
-+
-+  return stddev / mean;
-+}
-+
- std::vector<BenchmarkReporter::Run> ComputeStats(
-     const std::vector<BenchmarkReporter::Run>& reports) {
-   typedef BenchmarkReporter::Run Run;
-@@ -112,22 +121,22 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
-         it = counter_stats.find(cnt.first);
-         it->second.s.reserve(reports.size());
-       } else {
--        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
-+        BM_CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
-       }
-     }
-   }
- 
-   // Populate the accumulators.
-   for (Run const& run : reports) {
--    CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
--    CHECK_EQ(run_iterations, run.iterations);
-+    BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
-+    BM_CHECK_EQ(run_iterations, run.iterations);
-     if (run.error_occurred) continue;
-     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
-     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
-     // user counters
-     for (auto const& cnt : run.counters) {
-       auto it = counter_stats.find(cnt.first);
--      CHECK_NE(it, counter_stats.end());
-+      BM_CHECK_NE(it, counter_stats.end());
-       it->second.s.emplace_back(cnt.second);
-     }
-   }
-@@ -148,11 +157,14 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
-     // Get the data from the accumulator to BenchmarkReporter::Run's.
-     Run data;
-     data.run_name = reports[0].run_name;
-+    data.family_index = reports[0].family_index;
-+    data.per_family_instance_index = reports[0].per_family_instance_index;
-     data.run_type = BenchmarkReporter::Run::RT_Aggregate;
-     data.threads = reports[0].threads;
-     data.repetitions = reports[0].repetitions;
-     data.repetition_index = Run::no_repetition_index;
-     data.aggregate_name = Stat.name_;
-+    data.aggregate_unit = Stat.unit_;
-     data.report_label = report_label;
- 
-     // It is incorrect to say that an aggregate is computed over
-@@ -165,13 +177,15 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
-     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
-     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
- 
--    // We will divide these times by data.iterations when reporting, but the
--    // data.iterations is not nessesairly the scale of these measurements,
--    // because in each repetition, these timers are sum over all the iterations.
--    // And if we want to say that the stats are over N repetitions and not
--    // M iterations, we need to multiply these by (N/M).
--    data.real_accumulated_time *= iteration_rescale_factor;
--    data.cpu_accumulated_time *= iteration_rescale_factor;
-+    if (data.aggregate_unit == StatisticUnit::kTime) {
-+      // We will divide these times by data.iterations when reporting, but the
-+      // data.iterations is not necessarily the scale of these measurements,
-+      // because in each repetition, these timers are sum over all the iters.
-+      // And if we want to say that the stats are over N repetitions and not
-+      // M iterations, we need to multiply these by (N/M).
-+      data.real_accumulated_time *= iteration_rescale_factor;
-+      data.cpu_accumulated_time *= iteration_rescale_factor;
-+    }
- 
-     data.time_unit = reports[0].time_unit;
- 
-diff --git a/lib/benchmark/src/statistics.h b/lib/benchmark/src/statistics.h
-index 7eccc85536..a9545a58c6 100644
---- a/lib/benchmark/src/statistics.h
-+++ b/lib/benchmark/src/statistics.h
-@@ -31,6 +31,7 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
- double StatisticsMean(const std::vector<double>& v);
- double StatisticsMedian(const std::vector<double>& v);
- double StatisticsStdDev(const std::vector<double>& v);
-+double StatisticsCV(const std::vector<double>& v);
- 
- }  // end namespace benchmark
- 
-diff --git a/lib/benchmark/src/string_util.cc b/lib/benchmark/src/string_util.cc
-index 39b01a1719..3551418174 100644
---- a/lib/benchmark/src/string_util.cc
-+++ b/lib/benchmark/src/string_util.cc
-@@ -1,6 +1,9 @@
- #include "string_util.h"
- 
- #include <array>
-+#ifdef BENCHMARK_STL_ANDROID_GNUSTL
-+#include <cerrno>
-+#endif
- #include <cmath>
- #include <cstdarg>
- #include <cstdio>
-@@ -160,6 +163,19 @@ std::string StrFormat(const char* format, ...) {
-   return tmp;
- }
- 
-+std::vector<std::string> StrSplit(const std::string& str, char delim) {
-+  if (str.empty()) return {};
-+  std::vector<std::string> ret;
-+  size_t first = 0;
-+  size_t next = str.find(delim);
-+  for (; next != std::string::npos;
-+       first = next + 1, next = str.find(delim, first)) {
-+    ret.push_back(str.substr(first, next - first));
-+  }
-+  ret.push_back(str.substr(first));
-+  return ret;
-+}
-+
- #ifdef BENCHMARK_STL_ANDROID_GNUSTL
- /*
-  * GNU STL in Android NDK lacks support for some C++11 functions, including
-diff --git a/lib/benchmark/src/string_util.h b/lib/benchmark/src/string_util.h
-index 09d7b4bd2a..6bc28b6912 100644
---- a/lib/benchmark/src/string_util.h
-+++ b/lib/benchmark/src/string_util.h
-@@ -37,6 +37,8 @@ inline std::string StrCat(Args&&... args) {
-   return ss.str();
- }
- 
-+std::vector<std::string> StrSplit(const std::string& str, char delim);
-+
- #ifdef BENCHMARK_STL_ANDROID_GNUSTL
- /*
-  * GNU STL in Android NDK lacks support for some C++11 functions, including
-diff --git a/lib/benchmark/src/sysinfo.cc b/lib/benchmark/src/sysinfo.cc
-index 5b7c4af780..937604fe58 100644
---- a/lib/benchmark/src/sysinfo.cc
-+++ b/lib/benchmark/src/sysinfo.cc
-@@ -29,7 +29,8 @@
- #include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
- #include <unistd.h>
- #if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \
--    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD
-+    defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || \
-+    defined BENCHMARK_OS_DRAGONFLY
- #define BENCHMARK_HAS_SYSCTL
- #include <sys/sysctl.h>
- #endif
-@@ -57,6 +58,7 @@
- #include <memory>
- #include <sstream>
- #include <locale>
-+#include <utility>
- 
- #include "check.h"
- #include "cycleclock.h"
-@@ -133,7 +135,7 @@ struct ValueUnion {
-   template <class T, int N>
-   std::array<T, N> GetAsArray() {
-     const int ArrSize = sizeof(T) * N;
--    CHECK_LE(ArrSize, Size);
-+    BM_CHECK_LE(ArrSize, Size);
-     std::array<T, N> Arr;
-     std::memcpy(Arr.data(), data(), ArrSize);
-     return Arr;
-@@ -209,13 +211,12 @@ bool ReadFromFile(std::string const& fname, ArgT* arg) {
-   return f.good();
- }
- 
--bool CpuScalingEnabled(int num_cpus) {
-+CPUInfo::Scaling CpuScaling(int num_cpus) {
-   // We don't have a valid CPU count, so don't even bother.
--  if (num_cpus <= 0) return false;
--#ifdef BENCHMARK_OS_QNX
--  return false;
--#endif
--#ifndef BENCHMARK_OS_WINDOWS
-+  if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN;
-+#if defined(BENCHMARK_OS_QNX)
-+  return CPUInfo::Scaling::UNKNOWN;
-+#elif !defined(BENCHMARK_OS_WINDOWS)
-   // On Linux, the CPUfreq subsystem exposes CPU information as files on the
-   // local file system. If reading the exported files fails, then we may not be
-   // running on Linux, so we silently ignore all the read errors.
-@@ -223,10 +224,12 @@ bool CpuScalingEnabled(int num_cpus) {
-   for (int cpu = 0; cpu < num_cpus; ++cpu) {
-     std::string governor_file =
-         StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor");
--    if (ReadFromFile(governor_file, &res) && res != "performance") return true;
-+    if (ReadFromFile(governor_file, &res) && res != "performance") return CPUInfo::Scaling::ENABLED;
-   }
-+  return CPUInfo::Scaling::DISABLED;
-+#else
-+  return CPUInfo::Scaling::UNKNOWN;
- #endif
--  return false;
- }
- 
- int CountSetBitsInCPUMap(std::string Val) {
-@@ -382,9 +385,11 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesQNX() {
-       case CACHE_FLAG_UNIFIED :
-         info.type = "Unified";
-         info.level = 2;
-+        break;
-       case CACHE_FLAG_SHARED :
-         info.type = "Shared";
-         info.level = 3;
-+        break;
-       default :
-         continue;
-         break;
-@@ -439,7 +444,7 @@ std::string GetSystemName() {
- #elif defined(BENCHMARK_OS_RTEMS)
- #define HOST_NAME_MAX 256
- #else
--#warning "HOST_NAME_MAX not defined. using 64"
-+#pragma message("HOST_NAME_MAX not defined. using 64")
- #define HOST_NAME_MAX 64
- #endif
- #endif // def HOST_NAME_MAX
-@@ -525,7 +530,11 @@ int GetNumCPUs() {
-   BENCHMARK_UNREACHABLE();
- }
- 
--double GetCPUCyclesPerSecond() {
-+double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) {
-+  // Currently, scaling is only used on linux path here,
-+  // suppress diagnostics about it being unused on other paths.
-+  (void)scaling;
-+
- #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN
-   long freq;
- 
-@@ -536,8 +545,15 @@ double GetCPUCyclesPerSecond() {
-   // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as
-   // well.
-   if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)
--      // If CPU scaling is in effect, we want to use the *maximum* frequency,
--      // not whatever CPU speed some random processor happens to be using now.
-+      // If CPU scaling is disabled, use the *current* frequency.
-+      // Note that we specifically don't want to read cpuinfo_cur_freq,
-+      // because it is only readable by root.
-+      || (scaling == CPUInfo::Scaling::DISABLED &&
-+          ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
-+                       &freq))
-+      // Otherwise, if CPU scaling may be in effect, we want to use
-+      // the *maximum* frequency, not whatever CPU speed some random processor
-+      // happens to be using now.
-       || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
-                       &freq)) {
-     // The value is in kHz (as the file name suggests).  For example, on a
-@@ -603,6 +619,8 @@ double GetCPUCyclesPerSecond() {
-       "machdep.tsc_freq";
- #elif defined BENCHMARK_OS_OPENBSD
-       "hw.cpuspeed";
-+#elif defined BENCHMARK_OS_DRAGONFLY
-+      "hw.tsc_frequency";
- #else
-       "hw.cpufrequency";
- #endif
-@@ -667,9 +685,10 @@ double GetCPUCyclesPerSecond() {
- }
- 
- std::vector<double> GetLoadAvg() {
--#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
--    defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||  \
--    defined BENCHMARK_OS_OPENBSD) && !defined(__ANDROID__)
-+#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) ||     \
-+     defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||      \
-+     defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \
-+    !defined(__ANDROID__)
-   constexpr int kMaxSamples = 3;
-   std::vector<double> res(kMaxSamples, 0.0);
-   const int nelem = getloadavg(res.data(), kMaxSamples);
-@@ -693,12 +712,11 @@ const CPUInfo& CPUInfo::Get() {
- 
- CPUInfo::CPUInfo()
-     : num_cpus(GetNumCPUs()),
--      cycles_per_second(GetCPUCyclesPerSecond()),
-+      scaling(CpuScaling(num_cpus)),
-+      cycles_per_second(GetCPUCyclesPerSecond(scaling)),
-       caches(GetCacheSizes()),
--      scaling_enabled(CpuScalingEnabled(num_cpus)),
-       load_avg(GetLoadAvg()) {}
- 
--
- const SystemInfo& SystemInfo::Get() {
-   static const SystemInfo* info = new SystemInfo();
-   return *info;
-diff --git a/lib/benchmark/src/thread_timer.h b/lib/benchmark/src/thread_timer.h
-index 1703ca0d6f..eb23f59561 100644
---- a/lib/benchmark/src/thread_timer.h
-+++ b/lib/benchmark/src/thread_timer.h
-@@ -28,7 +28,7 @@ class ThreadTimer {
- 
-   // Called by each thread
-   void StopTimer() {
--    CHECK(running_);
-+    BM_CHECK(running_);
-     running_ = false;
-     real_time_used_ += ChronoClockNow() - start_real_time_;
-     // Floating point error can result in the subtraction producing a negative
-@@ -44,19 +44,19 @@ class ThreadTimer {
- 
-   // REQUIRES: timer is not running
-   double real_time_used() const {
--    CHECK(!running_);
-+    BM_CHECK(!running_);
-     return real_time_used_;
-   }
- 
-   // REQUIRES: timer is not running
-   double cpu_time_used() const {
--    CHECK(!running_);
-+    BM_CHECK(!running_);
-     return cpu_time_used_;
-   }
- 
-   // REQUIRES: timer is not running
-   double manual_time_used() const {
--    CHECK(!running_);
-+    BM_CHECK(!running_);
-     return manual_time_used_;
-   }
- 
-diff --git a/lib/benchmark/src/timers.cc b/lib/benchmark/src/timers.cc
-index 7613ff92c6..1f05574269 100644
---- a/lib/benchmark/src/timers.cc
-+++ b/lib/benchmark/src/timers.cc
-@@ -28,7 +28,8 @@
- #include <sys/time.h>
- #include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
- #include <unistd.h>
--#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
-+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_DRAGONFLY || \
-+    defined BENCHMARK_OS_MACOSX
- #include <sys/sysctl.h>
- #endif
- #if defined(BENCHMARK_OS_MACOSX)
-@@ -178,40 +179,75 @@ double ThreadCPUUsage() {
- #endif
- }
- 
--namespace {
--
--std::string DateTimeString(bool local) {
-+std::string LocalDateTimeString() {
-+  // Write the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM.
-   typedef std::chrono::system_clock Clock;
-   std::time_t now = Clock::to_time_t(Clock::now());
--  const std::size_t kStorageSize = 128;
--  char storage[kStorageSize];
--  std::size_t written;
-+  const std::size_t kTzOffsetLen = 6;
-+  const std::size_t kTimestampLen = 19;
-+
-+  std::size_t tz_len;
-+  std::size_t timestamp_len;
-+  long int offset_minutes;
-+  char tz_offset_sign = '+';
-+  // tz_offset is set in one of three ways:
-+  // * strftime with %z - This either returns empty or the ISO 8601 time.  The maximum length an
-+  //   ISO 8601 string can be is 7 (e.g. -03:30, plus trailing zero).
-+  // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to 19 for %02li,
-+  //   one for :, up to 19 %02li, plus trailing zero).
-+  // * A fixed string of "-00:00".  The maximum length is 7 (-00:00, plus trailing zero).
-+  //
-+  // Thus, the maximum size this needs to be is 41.
-+  char tz_offset[41];
-+  // Long enough buffer to avoid format-overflow warnings
-+  char storage[128];
- 
--  if (local) {
- #if defined(BENCHMARK_OS_WINDOWS)
--    written =
--        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
-+  std::tm *timeinfo_p = ::localtime(&now);
- #else
--    std::tm timeinfo;
--    ::localtime_r(&now, &timeinfo);
--    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-+  std::tm timeinfo;
-+  std::tm *timeinfo_p = &timeinfo;
-+  ::localtime_r(&now, &timeinfo);
- #endif
-+
-+  tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p);
-+
-+  if (tz_len < kTzOffsetLen && tz_len > 1) {
-+    // Timezone offset was written. strftime writes offset as +HHMM or -HHMM,
-+    // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse
-+    // the offset as an integer, then reprint it to a string.
-+
-+    offset_minutes = ::strtol(tz_offset, NULL, 10);
-+    if (offset_minutes < 0) {
-+      offset_minutes *= -1;
-+      tz_offset_sign = '-';
-+    }
-+
-+    tz_len = ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li",
-+        tz_offset_sign, offset_minutes / 100, offset_minutes % 100);
-+    BM_CHECK(tz_len == kTzOffsetLen);
-+    ((void)tz_len); // Prevent unused variable warning in optimized build.
-   } else {
-+    // Unknown offset. RFC3339 specifies that unknown local offsets should be
-+    // written as UTC time with -00:00 timezone.
- #if defined(BENCHMARK_OS_WINDOWS)
--    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
-+    // Potential race condition if another thread calls localtime or gmtime.
-+    timeinfo_p = ::gmtime(&now);
- #else
--    std::tm timeinfo;
-     ::gmtime_r(&now, &timeinfo);
--    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
- #endif
-+
-+    strncpy(tz_offset, "-00:00", kTzOffsetLen + 1);
-   }
--  CHECK(written < kStorageSize);
--  ((void)written);  // prevent unused variable in optimized mode.
--  return std::string(storage);
--}
- 
--}  // end namespace
-+  timestamp_len = std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S",
-+      timeinfo_p);
-+  BM_CHECK(timestamp_len == kTimestampLen);
-+  // Prevent unused variable warning in optimized build.
-+  ((void)kTimestampLen);
- 
--std::string LocalDateTimeString() { return DateTimeString(true); }
-+  std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1);
-+  return std::string(storage);
-+}
- 
- }  // end namespace benchmark
-diff --git a/lib/benchmark/tools/compare.py b/lib/benchmark/tools/compare.py
-index 539ace6fb1..01d2c89f50 100755
---- a/lib/benchmark/tools/compare.py
-+++ b/lib/benchmark/tools/compare.py
-@@ -7,6 +7,7 @@ compare.py - versatile benchmark output compare tool
- 
- import argparse
- from argparse import ArgumentParser
-+import json
- import sys
- import gbench
- from gbench import util, report
-@@ -48,6 +49,20 @@ def create_parser():
-              "of repetitions. Do note that only the display is affected. "
-              "Internally, all the actual runs are still used, e.g. for U test.")
- 
-+    parser.add_argument(
-+        '--no-color',
-+        dest='color',
-+        default=True,
-+        action="store_false",
-+        help="Do not use colors in the terminal output"
-+    )
-+
-+    parser.add_argument(
-+        '-d',
-+        '--dump_to_json',
-+        dest='dump_to_json',
-+        help="Additionally, dump benchmark comparison output to this file in JSON format.")
-+
-     utest = parser.add_argument_group()
-     utest.add_argument(
-         '--no-utest',
-@@ -223,10 +238,10 @@ def main():
-         options_contender = ['--benchmark_filter=%s' % filter_contender]
- 
-     # Run the benchmarks and report the results
--    json1 = json1_orig = gbench.util.run_or_load_benchmark(
--        test_baseline, benchmark_options + options_baseline)
--    json2 = json2_orig = gbench.util.run_or_load_benchmark(
--        test_contender, benchmark_options + options_contender)
-+    json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
-+        test_baseline, benchmark_options + options_baseline))
-+    json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
-+        test_contender, benchmark_options + options_contender))
- 
-     # Now, filter the benchmarks so that the difference report can work
-     if filter_baseline and filter_contender:
-@@ -236,14 +251,20 @@ def main():
-         json2 = gbench.report.filter_benchmark(
-             json2_orig, filter_contender, replacement)
- 
--    # Diff and output
--    output_lines = gbench.report.generate_difference_report(
--        json1, json2, args.display_aggregates_only,
--        args.utest, args.utest_alpha)
-+    diff_report = gbench.report.get_difference_report(
-+        json1, json2, args.utest)
-+    output_lines = gbench.report.print_difference_report(
-+        diff_report,
-+        args.display_aggregates_only,
-+        args.utest, args.utest_alpha, args.color)
-     print(description)
-     for ln in output_lines:
-         print(ln)
- 
-+    # Optionally, diff and output to JSON
-+    if args.dump_to_json is not None:
-+        with open(args.dump_to_json, 'w') as f_json:
-+            json.dump(diff_report, f_json)
- 
- class TestParser(unittest.TestCase):
-     def setUp(self):
-diff --git a/lib/benchmark/tools/gbench/report.py b/lib/benchmark/tools/gbench/report.py
-index 5bd3a8d85d..8203cbad02 100644
---- a/lib/benchmark/tools/gbench/report.py
-+++ b/lib/benchmark/tools/gbench/report.py
-@@ -1,9 +1,11 @@
--import unittest
- """report.py - Utilities for reporting statistics about benchmark results
- """
-+
-+import unittest
- import os
- import re
- import copy
-+import random
- 
- from scipy.stats import mannwhitneyu
- 
-@@ -154,6 +156,7 @@ def extract_field(partition, field_name):
-     rhs = [x[field_name] for x in partition[1]]
-     return [lhs, rhs]
- 
-+
- def calc_utest(timings_cpu, timings_time):
-     min_rep_cnt = min(len(timings_time[0]),
-                       len(timings_time[1]),
-@@ -171,46 +174,106 @@ def calc_utest(timings_cpu, timings_time):
- 
-     return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
- 
--def print_utest(partition, utest_alpha, first_col_width, use_color=True):
-+def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
-     def get_utest_color(pval):
-         return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
- 
--    timings_time = extract_field(partition, 'real_time')
--    timings_cpu = extract_field(partition, 'cpu_time')
--    have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
--
-     # Check if we failed miserably with minimum required repetitions for utest
--    if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None:
-+    if not utest['have_optimal_repetitions'] and utest['cpu_pvalue'] is None and utest['time_pvalue'] is None:
-         return []
- 
-     dsc = "U Test, Repetitions: {} vs {}".format(
--        len(timings_cpu[0]), len(timings_cpu[1]))
-+        utest['nr_of_repetitions'], utest['nr_of_repetitions_other'])
-     dsc_color = BC_OKGREEN
- 
-     # We still got some results to show but issue a warning about it.
--    if not have_optimal_repetitions:
-+    if not utest['have_optimal_repetitions']:
-         dsc_color = BC_WARNING
-         dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
-             UTEST_OPTIMAL_REPETITIONS)
- 
-     special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
- 
--    last_name = partition[0][0]['name']
-     return [color_format(use_color,
-                          special_str,
-                          BC_HEADER,
--                         "{}{}".format(last_name, UTEST_COL_NAME),
-+                         "{}{}".format(bc_name, UTEST_COL_NAME),
-                          first_col_width,
--                         get_utest_color(time_pvalue), time_pvalue,
--                         get_utest_color(cpu_pvalue), cpu_pvalue,
-+                         get_utest_color(
-+                             utest['time_pvalue']), utest['time_pvalue'],
-+                         get_utest_color(
-+                             utest['cpu_pvalue']), utest['cpu_pvalue'],
-                          dsc_color, dsc,
-                          endc=BC_ENDC)]
- 
- 
--def generate_difference_report(
-+def get_difference_report(
-         json1,
-         json2,
--        display_aggregates_only=False,
-+        utest=False):
-+    """
-+    Calculate and report the difference between each test of two benchmarks
-+    runs specified as 'json1' and 'json2'. Output is another json containing
-+    relevant details for each test run.
-+    """
-+    assert utest is True or utest is False
-+
-+    diff_report = []
-+    partitions = partition_benchmarks(json1, json2)
-+    for partition in partitions:
-+        benchmark_name = partition[0][0]['name']
-+        time_unit = partition[0][0]['time_unit']
-+        measurements = []
-+        utest_results = {}
-+        # Careful, we may have different repetition count.
-+        for i in range(min(len(partition[0]), len(partition[1]))):
-+            bn = partition[0][i]
-+            other_bench = partition[1][i]
-+            measurements.append({
-+                'real_time': bn['real_time'],
-+                'cpu_time': bn['cpu_time'],
-+                'real_time_other': other_bench['real_time'],
-+                'cpu_time_other': other_bench['cpu_time'],
-+                'time': calculate_change(bn['real_time'], other_bench['real_time']),
-+                'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time'])
-+            })
-+
-+        # After processing the whole partition, if requested, do the U test.
-+        if utest:
-+            timings_cpu = extract_field(partition, 'cpu_time')
-+            timings_time = extract_field(partition, 'real_time')
-+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time)
-+            if cpu_pvalue and time_pvalue:
-+                utest_results = {
-+                    'have_optimal_repetitions': have_optimal_repetitions,
-+                    'cpu_pvalue': cpu_pvalue,
-+                    'time_pvalue': time_pvalue,
-+                    'nr_of_repetitions': len(timings_cpu[0]),
-+                    'nr_of_repetitions_other': len(timings_cpu[1])
-+                }
-+
-+        # Store only if we had any measurements for given benchmark.
-+        # E.g. partition_benchmarks will filter out the benchmarks having
-+        # time units which are not compatible with other time units in the
-+        # benchmark suite.
-+        if measurements:
-+            run_type = partition[0][0]['run_type'] if 'run_type' in partition[0][0] else ''
-+            aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else ''
-+            diff_report.append({
-+                'name': benchmark_name,
-+                'measurements': measurements,
-+                'time_unit': time_unit,
-+                'run_type': run_type,
-+                'aggregate_name': aggregate_name,
-+                'utest': utest_results
-+            })
-+
-+    return diff_report
-+
-+
-+def print_difference_report(
-+        json_diff_report,
-+        include_aggregates_only=False,
-         utest=False,
-         utest_alpha=0.05,
-         use_color=True):
-@@ -219,14 +282,16 @@ def generate_difference_report(
-     runs specified as 'json1' and 'json2'.
-     """
-     assert utest is True or utest is False
--    first_col_width = find_longest_name(json1['benchmarks'])
- 
--    def find_test(name):
--        for b in json2['benchmarks']:
--            if b['name'] == name:
--                return b
--        return None
-+    def get_color(res):
-+        if res > 0.05:
-+            return BC_FAIL
-+        elif res > -0.07:
-+            return BC_WHITE
-+        else:
-+            return BC_CYAN
- 
-+    first_col_width = find_longest_name(json_diff_report)
-     first_col_width = max(
-         first_col_width,
-         len('Benchmark'))
-@@ -235,50 +300,33 @@ def generate_difference_report(
-         'Benchmark', 12 + first_col_width)
-     output_strs = [first_line, '-' * len(first_line)]
- 
--    partitions = partition_benchmarks(json1, json2)
--    for partition in partitions:
--        # Careful, we may have different repetition count.
--        for i in range(min(len(partition[0]), len(partition[1]))):
--            bn = partition[0][i]
--            other_bench = partition[1][i]
--
--            # *If* we were asked to only display aggregates,
--            # and if it is non-aggregate, then skip it.
--            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
--                assert bn['run_type'] == other_bench['run_type']
--                if bn['run_type'] != 'aggregate':
--                    continue
--
--            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
--
--            def get_color(res):
--                if res > 0.05:
--                    return BC_FAIL
--                elif res > -0.07:
--                    return BC_WHITE
--                else:
--                    return BC_CYAN
--
--            tres = calculate_change(bn['real_time'], other_bench['real_time'])
--            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
--            output_strs += [color_format(use_color,
--                                         fmt_str,
--                                         BC_HEADER,
--                                         bn['name'],
--                                         first_col_width,
--                                         get_color(tres),
--                                         tres,
--                                         get_color(cpures),
--                                         cpures,
--                                         bn['real_time'],
--                                         other_bench['real_time'],
--                                         bn['cpu_time'],
--                                         other_bench['cpu_time'],
--                                         endc=BC_ENDC)]
--
--        # After processing the whole partition, if requested, do the U test.
--        if utest:
--            output_strs += print_utest(partition,
-+    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
-+    for benchmark in json_diff_report:
-+        # *If* we were asked to only include aggregates,
-+        # and if it is non-aggregate, then don't print it.
-+        if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
-+            for measurement in benchmark['measurements']:
-+                output_strs += [color_format(use_color,
-+                                            fmt_str,
-+                                            BC_HEADER,
-+                                            benchmark['name'],
-+                                            first_col_width,
-+                                            get_color(measurement['time']),
-+                                            measurement['time'],
-+                                            get_color(measurement['cpu']),
-+                                            measurement['cpu'],
-+                                            measurement['real_time'],
-+                                            measurement['real_time_other'],
-+                                            measurement['cpu_time'],
-+                                            measurement['cpu_time_other'],
-+                                            endc=BC_ENDC)]
-+
-+        # After processing the measurements, if requested and
-+        # if applicable (e.g. u-test exists for given benchmark),
-+        # print the U test.
-+        if utest and benchmark['utest']:
-+            output_strs += print_utest(benchmark['name'],
-+                                       benchmark['utest'],
-                                        utest_alpha=utest_alpha,
-                                        first_col_width=first_col_width,
-                                        use_color=use_color)
-@@ -319,21 +367,26 @@ class TestGetUniqueBenchmarkNames(unittest.TestCase):
- 
- 
- class TestReportDifference(unittest.TestCase):
--    def load_results(self):
--        import json
--        testInputs = os.path.join(
--            os.path.dirname(
--                os.path.realpath(__file__)),
--            'Inputs')
--        testOutput1 = os.path.join(testInputs, 'test1_run1.json')
--        testOutput2 = os.path.join(testInputs, 'test1_run2.json')
--        with open(testOutput1, 'r') as f:
--            json1 = json.load(f)
--        with open(testOutput2, 'r') as f:
--            json2 = json.load(f)
--        return json1, json2
--
--    def test_basic(self):
-+    @classmethod
-+    def setUpClass(cls):
-+        def load_results():
-+            import json
-+            testInputs = os.path.join(
-+                os.path.dirname(
-+                    os.path.realpath(__file__)),
-+                'Inputs')
-+            testOutput1 = os.path.join(testInputs, 'test1_run1.json')
-+            testOutput2 = os.path.join(testInputs, 'test1_run2.json')
-+            with open(testOutput1, 'r') as f:
-+                json1 = json.load(f)
-+            with open(testOutput2, 'r') as f:
-+                json2 = json.load(f)
-+            return json1, json2
-+
-+        json1, json2 = load_results()
-+        cls.json_diff_report = get_difference_report(json1, json2)
-+
-+    def test_json_diff_report_pretty_printing(self):
-         expect_lines = [
-             ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
-             ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
-@@ -351,9 +404,8 @@ class TestReportDifference(unittest.TestCase):
-             ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
-             ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
-         ]
--        json1, json2 = self.load_results()
--        output_lines_with_header = generate_difference_report(
--            json1, json2, use_color=False)
-+        output_lines_with_header = print_difference_report(
-+            self.json_diff_report, use_color=False)
-         output_lines = output_lines_with_header[2:]
-         print("\n")
-         print("\n".join(output_lines_with_header))
-@@ -363,31 +415,118 @@ class TestReportDifference(unittest.TestCase):
-             self.assertEqual(len(parts), 7)
-             self.assertEqual(expect_lines[i], parts)
- 
-+    def test_json_diff_report_output(self):
-+        expected_output = [
-+            {
-+                'name': 'BM_SameTimes',
-+                'measurements': [{'time': 0.0000, 'cpu': 0.0000, 'real_time': 10, 'real_time_other': 10, 'cpu_time': 10, 'cpu_time_other': 10}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_2xFaster',
-+                'measurements': [{'time': -0.5000, 'cpu': -0.5000, 'real_time': 50, 'real_time_other': 25, 'cpu_time': 50, 'cpu_time_other': 25}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_2xSlower',
-+                'measurements': [{'time': 1.0000, 'cpu': 1.0000, 'real_time': 50, 'real_time_other': 100, 'cpu_time': 50, 'cpu_time_other': 100}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_1PercentFaster',
-+                'measurements': [{'time': -0.0100, 'cpu': -0.0100, 'real_time': 100, 'real_time_other': 98.9999999, 'cpu_time': 100, 'cpu_time_other': 98.9999999}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_1PercentSlower',
-+                'measurements': [{'time': 0.0100, 'cpu': 0.0100, 'real_time': 100, 'real_time_other': 101, 'cpu_time': 100, 'cpu_time_other': 101}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_10PercentFaster',
-+                'measurements': [{'time': -0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 90, 'cpu_time': 100, 'cpu_time_other': 90}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_10PercentSlower',
-+                'measurements': [{'time': 0.1000, 'cpu': 0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 110}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_100xSlower',
-+                'measurements': [{'time': 99.0000, 'cpu': 99.0000, 'real_time': 100, 'real_time_other': 10000, 'cpu_time': 100, 'cpu_time_other': 10000}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_100xFaster',
-+                'measurements': [{'time': -0.9900, 'cpu': -0.9900, 'real_time': 10000, 'real_time_other': 100, 'cpu_time': 10000, 'cpu_time_other': 100}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_10PercentCPUToTime',
-+                'measurements': [{'time': 0.1000, 'cpu': -0.1000, 'real_time': 100, 'real_time_other': 110, 'cpu_time': 100, 'cpu_time_other': 90}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_ThirdFaster',
-+                'measurements': [{'time': -0.3333, 'cpu': -0.3334, 'real_time': 100, 'real_time_other': 67, 'cpu_time': 100, 'cpu_time_other': 67}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': 'BM_NotBadTimeUnit',
-+                'measurements': [{'time': -0.9000, 'cpu': 0.2000, 'real_time': 0.4, 'real_time_other': 0.04, 'cpu_time': 0.5, 'cpu_time_other': 0.6}],
-+                'time_unit': 's',
-+                'utest': {}
-+            },
-+        ]
-+        self.assertEqual(len(self.json_diff_report), len(expected_output))
-+        for out, expected in zip(
-+                self.json_diff_report, expected_output):
-+            self.assertEqual(out['name'], expected['name'])
-+            self.assertEqual(out['time_unit'], expected['time_unit'])
-+            assert_utest(self, out, expected)
-+            assert_measurements(self, out, expected)
-+
- 
- class TestReportDifferenceBetweenFamilies(unittest.TestCase):
--    def load_result(self):
--        import json
--        testInputs = os.path.join(
--            os.path.dirname(
--                os.path.realpath(__file__)),
--            'Inputs')
--        testOutput = os.path.join(testInputs, 'test2_run.json')
--        with open(testOutput, 'r') as f:
--            json = json.load(f)
--        return json
-+    @classmethod
-+    def setUpClass(cls):
-+        def load_result():
-+            import json
-+            testInputs = os.path.join(
-+                os.path.dirname(
-+                    os.path.realpath(__file__)),
-+                'Inputs')
-+            testOutput = os.path.join(testInputs, 'test2_run.json')
-+            with open(testOutput, 'r') as f:
-+                json = json.load(f)
-+            return json
-+
-+        json = load_result()
-+        json1 = filter_benchmark(json, "BM_Z.ro", ".")
-+        json2 = filter_benchmark(json, "BM_O.e", ".")
-+        cls.json_diff_report = get_difference_report(json1, json2)
- 
--    def test_basic(self):
-+    def test_json_diff_report_pretty_printing(self):
-         expect_lines = [
-             ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
-             ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
-             ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
-             ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
-         ]
--        json = self.load_result()
--        json1 = filter_benchmark(json, "BM_Z.ro", ".")
--        json2 = filter_benchmark(json, "BM_O.e", ".")
--        output_lines_with_header = generate_difference_report(
--            json1, json2, use_color=False)
-+        output_lines_with_header = print_difference_report(
-+            self.json_diff_report, use_color=False)
-         output_lines = output_lines_with_header[2:]
-         print("\n")
-         print("\n".join(output_lines_with_header))
-@@ -397,31 +536,71 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
-             self.assertEqual(len(parts), 7)
-             self.assertEqual(expect_lines[i], parts)
- 
-+    def test_json_diff_report(self):
-+        expected_output = [
-+            {
-+                'name': u'.',
-+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': u'./4',
-+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}],
-+                'time_unit': 'ns',
-+                'utest': {},
-+            },
-+            {
-+                'name': u'Prefix/.',
-+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': u'Prefix/./3',
-+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            }
-+        ]
-+        self.assertEqual(len(self.json_diff_report), len(expected_output))
-+        for out, expected in zip(
-+                self.json_diff_report, expected_output):
-+            self.assertEqual(out['name'], expected['name'])
-+            self.assertEqual(out['time_unit'], expected['time_unit'])
-+            assert_utest(self, out, expected)
-+            assert_measurements(self, out, expected)
-+
- 
- class TestReportDifferenceWithUTest(unittest.TestCase):
--    def load_results(self):
--        import json
--        testInputs = os.path.join(
--            os.path.dirname(
--                os.path.realpath(__file__)),
--            'Inputs')
--        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
--        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
--        with open(testOutput1, 'r') as f:
--            json1 = json.load(f)
--        with open(testOutput2, 'r') as f:
--            json2 = json.load(f)
--        return json1, json2
--
--    def test_utest(self):
--        expect_lines = []
-+    @classmethod
-+    def setUpClass(cls):
-+        def load_results():
-+            import json
-+            testInputs = os.path.join(
-+                os.path.dirname(
-+                    os.path.realpath(__file__)),
-+                'Inputs')
-+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
-+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
-+            with open(testOutput1, 'r') as f:
-+                json1 = json.load(f)
-+            with open(testOutput2, 'r') as f:
-+                json2 = json.load(f)
-+            return json1, json2
-+
-+        json1, json2 = load_results()
-+        cls.json_diff_report = get_difference_report(
-+            json1, json2, utest=True)
-+
-+    def test_json_diff_report_pretty_printing(self):
-         expect_lines = [
-             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
-             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
-             ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
-             ['BM_Two_pvalue',
--             '0.6985',
--             '0.6985',
-+             '1.0000',
-+             '0.6667',
-              'U',
-              'Test,',
-              'Repetitions:',
-@@ -438,7 +617,7 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
-             ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
-             ['short_pvalue',
-              '0.7671',
--             '0.1489',
-+             '0.2000',
-              'U',
-              'Test,',
-              'Repetitions:',
-@@ -453,9 +632,54 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
-              'recommended.'],
-             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
-         ]
--        json1, json2 = self.load_results()
--        output_lines_with_header = generate_difference_report(
--            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
-+        output_lines_with_header = print_difference_report(
-+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
-+        output_lines = output_lines_with_header[2:]
-+        print("\n")
-+        print("\n".join(output_lines_with_header))
-+        self.assertEqual(len(output_lines), len(expect_lines))
-+        for i in range(0, len(output_lines)):
-+            parts = [x for x in output_lines[i].split(' ') if x]
-+            self.assertEqual(expect_lines[i], parts)
-+
-+    def test_json_diff_report_pretty_printing_aggregates_only(self):
-+        expect_lines = [
-+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
-+            ['BM_Two_pvalue',
-+             '1.0000',
-+             '0.6667',
-+             'U',
-+             'Test,',
-+             'Repetitions:',
-+             '2',
-+             'vs',
-+             '2.',
-+             'WARNING:',
-+             'Results',
-+             'unreliable!',
-+             '9+',
-+             'repetitions',
-+             'recommended.'],
-+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
-+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
-+            ['short_pvalue',
-+             '0.7671',
-+             '0.2000',
-+             'U',
-+             'Test,',
-+             'Repetitions:',
-+             '2',
-+             'vs',
-+             '3.',
-+             'WARNING:',
-+             'Results',
-+             'unreliable!',
-+             '9+',
-+             'repetitions',
-+             'recommended.'],
-+        ]
-+        output_lines_with_header = print_difference_report(
-+            self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
-         output_lines = output_lines_with_header[2:]
-         print("\n")
-         print("\n".join(output_lines_with_header))
-@@ -464,32 +688,112 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
-             parts = [x for x in output_lines[i].split(' ') if x]
-             self.assertEqual(expect_lines[i], parts)
- 
-+    def test_json_diff_report(self):
-+        expected_output = [
-+            {
-+                'name': u'BM_One',
-+                'measurements': [
-+                    {'time': -0.1,
-+                     'cpu': 0.1,
-+                     'real_time': 10,
-+                     'real_time_other': 9,
-+                     'cpu_time': 100,
-+                     'cpu_time_other': 110}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': u'BM_Two',
-+                'measurements': [
-+                    {'time': 0.1111111111111111,
-+                     'cpu': -0.011111111111111112,
-+                     'real_time': 9,
-+                     'real_time_other': 10,
-+                     'cpu_time': 90,
-+                     'cpu_time_other': 89},
-+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
-+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {
-+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
-+                }
-+            },
-+            {
-+                'name': u'short',
-+                'measurements': [
-+                    {'time': -0.125,
-+                     'cpu': -0.0625,
-+                     'real_time': 8,
-+                     'real_time_other': 7,
-+                     'cpu_time': 80,
-+                     'cpu_time_other': 75},
-+                    {'time': -0.4325,
-+                     'cpu': -0.13506493506493514,
-+                     'real_time': 8,
-+                     'real_time_other': 4.54,
-+                     'cpu_time': 77,
-+                     'cpu_time_other': 66.6}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {
-+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
-+                }
-+            },
-+            {
-+                'name': u'medium',
-+                'measurements': [
-+                    {'time': -0.375,
-+                     'cpu': -0.3375,
-+                     'real_time': 8,
-+                     'real_time_other': 5,
-+                     'cpu_time': 80,
-+                     'cpu_time_other': 53}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            }
-+        ]
-+        self.assertEqual(len(self.json_diff_report), len(expected_output))
-+        for out, expected in zip(
-+                self.json_diff_report, expected_output):
-+            self.assertEqual(out['name'], expected['name'])
-+            self.assertEqual(out['time_unit'], expected['time_unit'])
-+            assert_utest(self, out, expected)
-+            assert_measurements(self, out, expected)
-+
- 
- class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
-         unittest.TestCase):
--    def load_results(self):
--        import json
--        testInputs = os.path.join(
--            os.path.dirname(
--                os.path.realpath(__file__)),
--            'Inputs')
--        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
--        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
--        with open(testOutput1, 'r') as f:
--            json1 = json.load(f)
--        with open(testOutput2, 'r') as f:
--            json2 = json.load(f)
--        return json1, json2
--
--    def test_utest(self):
--        expect_lines = []
-+    @classmethod
-+    def setUpClass(cls):
-+        def load_results():
-+            import json
-+            testInputs = os.path.join(
-+                os.path.dirname(
-+                    os.path.realpath(__file__)),
-+                'Inputs')
-+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
-+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
-+            with open(testOutput1, 'r') as f:
-+                json1 = json.load(f)
-+            with open(testOutput2, 'r') as f:
-+                json2 = json.load(f)
-+            return json1, json2
-+
-+        json1, json2 = load_results()
-+        cls.json_diff_report = get_difference_report(
-+            json1, json2, utest=True)
-+
-+    def test_json_diff_report_pretty_printing(self):
-         expect_lines = [
-             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
-             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
-             ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
-             ['BM_Two_pvalue',
--             '0.6985',
--             '0.6985',
-+             '1.0000',
-+             '0.6667',
-              'U',
-              'Test,',
-              'Repetitions:',
-@@ -506,7 +810,7 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
-             ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
-             ['short_pvalue',
-              '0.7671',
--             '0.1489',
-+             '0.2000',
-              'U',
-              'Test,',
-              'Repetitions:',
-@@ -519,10 +823,126 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
-              '9+',
-              'repetitions',
-              'recommended.'],
-+             ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53']
-+        ]
-+        output_lines_with_header = print_difference_report(
-+            self.json_diff_report,
-+            utest=True, utest_alpha=0.05, use_color=False)
-+        output_lines = output_lines_with_header[2:]
-+        print("\n")
-+        print("\n".join(output_lines_with_header))
-+        self.assertEqual(len(output_lines), len(expect_lines))
-+        for i in range(0, len(output_lines)):
-+            parts = [x for x in output_lines[i].split(' ') if x]
-+            self.assertEqual(expect_lines[i], parts)
-+
-+    def test_json_diff_report(self):
-+        expected_output = [
-+            {
-+                'name': u'BM_One',
-+                'measurements': [
-+                    {'time': -0.1,
-+                     'cpu': 0.1,
-+                     'real_time': 10,
-+                     'real_time_other': 9,
-+                     'cpu_time': 100,
-+                     'cpu_time_other': 110}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            },
-+            {
-+                'name': u'BM_Two',
-+                'measurements': [
-+                    {'time': 0.1111111111111111,
-+                     'cpu': -0.011111111111111112,
-+                     'real_time': 9,
-+                     'real_time_other': 10,
-+                     'cpu_time': 90,
-+                     'cpu_time_other': 89},
-+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
-+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {
-+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
-+                }
-+            },
-+            {
-+                'name': u'short',
-+                'measurements': [
-+                    {'time': -0.125,
-+                     'cpu': -0.0625,
-+                     'real_time': 8,
-+                     'real_time_other': 7,
-+                     'cpu_time': 80,
-+                     'cpu_time_other': 75},
-+                    {'time': -0.4325,
-+                     'cpu': -0.13506493506493514,
-+                     'real_time': 8,
-+                     'real_time_other': 4.54,
-+                     'cpu_time': 77,
-+                     'cpu_time_other': 66.6}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {
-+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
-+                }
-+            },
-+            {
-+                'name': u'medium',
-+                'measurements': [
-+                    {'real_time_other': 5,
-+                     'cpu_time': 80,
-+                     'time': -0.375,
-+                     'real_time': 8,
-+                     'cpu_time_other': 53,
-+                     'cpu': -0.3375
-+                    }
-+                ],
-+                'utest': {},
-+                'time_unit': u'ns',
-+                'aggregate_name': ''
-+            }
-+        ]
-+        self.assertEqual(len(self.json_diff_report), len(expected_output))
-+        for out, expected in zip(
-+                self.json_diff_report, expected_output):
-+            self.assertEqual(out['name'], expected['name'])
-+            self.assertEqual(out['time_unit'], expected['time_unit'])
-+            assert_utest(self, out, expected)
-+            assert_measurements(self, out, expected)
-+
-+
-+
-+class TestReportDifferenceForPercentageAggregates(
-+        unittest.TestCase):
-+    @classmethod
-+    def setUpClass(cls):
-+        def load_results():
-+            import json
-+            testInputs = os.path.join(
-+                os.path.dirname(
-+                    os.path.realpath(__file__)),
-+                'Inputs')
-+            testOutput1 = os.path.join(testInputs, 'test4_run0.json')
-+            testOutput2 = os.path.join(testInputs, 'test4_run1.json')
-+            with open(testOutput1, 'r') as f:
-+                json1 = json.load(f)
-+            with open(testOutput2, 'r') as f:
-+                json2 = json.load(f)
-+            return json1, json2
-+
-+        json1, json2 = load_results()
-+        cls.json_diff_report = get_difference_report(
-+            json1, json2, utest=True)
-+
-+    def test_json_diff_report_pretty_printing(self):
-+        expect_lines = [
-+            ['whocares', '-0.5000', '+0.5000', '0', '0', '0', '0']
-         ]
--        json1, json2 = self.load_results()
--        output_lines_with_header = generate_difference_report(
--            json1, json2, display_aggregates_only=True,
-+        output_lines_with_header = print_difference_report(
-+            self.json_diff_report,
-             utest=True, utest_alpha=0.05, use_color=False)
-         output_lines = output_lines_with_header[2:]
-         print("\n")
-@@ -532,6 +952,99 @@ class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
-             parts = [x for x in output_lines[i].split(' ') if x]
-             self.assertEqual(expect_lines[i], parts)
- 
-+    def test_json_diff_report(self):
-+        expected_output = [
-+            {
-+                'name': u'whocares',
-+                'measurements': [
-+                    {'time': -0.5,
-+                     'cpu': 0.5,
-+                     'real_time': 0.01,
-+                     'real_time_other': 0.005,
-+                     'cpu_time': 0.10,
-+                     'cpu_time_other': 0.15}
-+                ],
-+                'time_unit': 'ns',
-+                'utest': {}
-+            }
-+        ]
-+        self.assertEqual(len(self.json_diff_report), len(expected_output))
-+        for out, expected in zip(
-+                self.json_diff_report, expected_output):
-+            self.assertEqual(out['name'], expected['name'])
-+            self.assertEqual(out['time_unit'], expected['time_unit'])
-+            assert_utest(self, out, expected)
-+            assert_measurements(self, out, expected)
-+
-+
-+class TestReportSorting(unittest.TestCase):
-+    @classmethod
-+    def setUpClass(cls):
-+        def load_result():
-+            import json
-+            testInputs = os.path.join(
-+                os.path.dirname(
-+                    os.path.realpath(__file__)),
-+                'Inputs')
-+            testOutput = os.path.join(testInputs, 'test4_run.json')
-+            with open(testOutput, 'r') as f:
-+                json = json.load(f)
-+            return json
-+
-+        cls.json = load_result()
-+
-+    def test_json_diff_report_pretty_printing(self):
-+        import util
-+
-+        expected_names = [
-+            "99 family 0 instance 0 repetition 0",
-+            "98 family 0 instance 0 repetition 1",
-+            "97 family 0 instance 0 aggregate",
-+            "96 family 0 instance 1 repetition 0",
-+            "95 family 0 instance 1 repetition 1",
-+            "94 family 0 instance 1 aggregate",
-+            "93 family 1 instance 0 repetition 0",
-+            "92 family 1 instance 0 repetition 1",
-+            "91 family 1 instance 0 aggregate",
-+            "90 family 1 instance 1 repetition 0",
-+            "89 family 1 instance 1 repetition 1",
-+            "88 family 1 instance 1 aggregate"
-+        ]
-+
-+        for n in range(len(self.json['benchmarks']) ** 2):
-+            random.shuffle(self.json['benchmarks'])
-+            sorted_benchmarks = util.sort_benchmark_results(self.json)[
-+                'benchmarks']
-+            self.assertEqual(len(expected_names), len(sorted_benchmarks))
-+            for out, expected in zip(sorted_benchmarks, expected_names):
-+                self.assertEqual(out['name'], expected)
-+
-+
-+def assert_utest(unittest_instance, lhs, rhs):
-+    if lhs['utest']:
-+        unittest_instance.assertAlmostEqual(
-+            lhs['utest']['cpu_pvalue'],
-+            rhs['utest']['cpu_pvalue'])
-+        unittest_instance.assertAlmostEqual(
-+            lhs['utest']['time_pvalue'],
-+            rhs['utest']['time_pvalue'])
-+        unittest_instance.assertEqual(
-+            lhs['utest']['have_optimal_repetitions'],
-+            rhs['utest']['have_optimal_repetitions'])
-+    else:
-+        # lhs is empty. assert if rhs is not.
-+        unittest_instance.assertEqual(lhs['utest'], rhs['utest'])
-+
-+
-+def assert_measurements(unittest_instance, lhs, rhs):
-+    for m1, m2 in zip(lhs['measurements'], rhs['measurements']):
-+        unittest_instance.assertEqual(m1['real_time'], m2['real_time'])
-+        unittest_instance.assertEqual(m1['cpu_time'], m2['cpu_time'])
-+        # m1['time'] and m1['cpu'] hold values which are being calculated,
-+        # and therefore we must use almost-equal pattern.
-+        unittest_instance.assertAlmostEqual(m1['time'], m2['time'], places=4)
-+        unittest_instance.assertAlmostEqual(m1['cpu'], m2['cpu'], places=4)
-+
- 
- if __name__ == '__main__':
-     unittest.main()
-diff --git a/lib/benchmark/tools/gbench/util.py b/lib/benchmark/tools/gbench/util.py
-index 1f8e8e2c47..5d0012c0cb 100644
---- a/lib/benchmark/tools/gbench/util.py
-+++ b/lib/benchmark/tools/gbench/util.py
-@@ -5,6 +5,7 @@ import os
- import tempfile
- import subprocess
- import sys
-+import functools
- 
- # Input file type enumeration
- IT_Invalid = 0
-@@ -119,6 +120,23 @@ def load_benchmark_results(fname):
-         return json.load(f)
- 
- 
-+def sort_benchmark_results(result):
-+    benchmarks = result['benchmarks']
-+
-+    # From inner key to the outer key!
-+    benchmarks = sorted(
-+        benchmarks, key=lambda benchmark: benchmark['repetition_index'] if 'repetition_index' in benchmark else -1)
-+    benchmarks = sorted(
-+        benchmarks, key=lambda benchmark: 1 if 'run_type' in benchmark and benchmark['run_type'] == "aggregate" else 0)
-+    benchmarks = sorted(
-+        benchmarks, key=lambda benchmark: benchmark['per_family_instance_index'] if 'per_family_instance_index' in benchmark else -1)
-+    benchmarks = sorted(
-+        benchmarks, key=lambda benchmark: benchmark['family_index'] if 'family_index' in benchmark else -1)
-+
-+    result['benchmarks'] = benchmarks
-+    return result
-+
-+
- def run_benchmark(exe_name, benchmark_flags):
-     """
-     Run a benchmark specified by 'exe_name' with the specified
-@@ -158,7 +176,6 @@ def run_or_load_benchmark(filename, benchmark_flags):
-     ftype = check_input_file(filename)
-     if ftype == IT_JSON:
-         return load_benchmark_results(filename)
--    elif ftype == IT_Executable:
-+    if ftype == IT_Executable:
-         return run_benchmark(filename, benchmark_flags)
--    else:
--        assert False  # This branch is unreachable
-+    raise ValueError('Unknown file type %s' % ftype)
--- 
-2.31.1
-