From 05ff5b4d74198b39e1729e5e1f95207685cb428f Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 16 May 2019 00:17:06 -0700 Subject: [PATCH 1/8] Let test cover data sizes both above and below n --- Lib/test/test_statistics.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 1922de5df4b0c5..3790ed44783123 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2161,17 +2161,18 @@ def test_specific_cases(self): # Quantiles should be idempotent if len(expected) >= 2: self.assertEqual(quantiles(expected, n=n), expected) - # Cross-check against other methods - if len(data) >= n: - # After end caps are added, method='inclusive' should - # give the same result as method='exclusive' whenever - # there are more data points than desired cut points. - padded_data = [min(data) - 1000] + data + [max(data) + 1000] - self.assertEqual( - quantiles(data, n=n), - quantiles(padded_data, n=n, method='inclusive'), - (n, data), - ) + # Cross-check against method='inclusive' which should give + # the same result after adding in minimum and maximum values + # extrapolated from the two lowest and two highest points. + sdata = sorted(data) + lo = 2 * sdata[0] - sdata[1] + hi = 2 * sdata[-1] - sdata[-2] + padded_data = data + [lo, hi] + self.assertEqual( + quantiles(data, n=n), + quantiles(padded_data, n=n, method='inclusive'), + (n, data), + ) # Invariant under tranlation and scaling def f(x): return 3.5 * x - 1234.675 From e0f12a4c8407ad2414351cc65cea000dcde4c3be Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 16 May 2019 00:48:21 -0700 Subject: [PATCH 2/8] Verify the Q2 agrees with median() for various data sizes --- Lib/test/test_statistics.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index 3790ed44783123..bf5f1067e7b912 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2189,6 +2189,11 @@ def f(x): actual = quantiles(statistics.NormalDist(), n=n) self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) for e, a in zip(expected, actual))) + # Q2 agrees with median() + for k in range(2, 60): + data = random.choices(range(100), k=k) + q1, q2, q3 = quantiles(data) + self.assertEqual(q2, statistics.median(data)) def test_specific_cases_inclusive(self): # Match results computed by hand and cross-checked @@ -2243,6 +2248,11 @@ def f(x): data.remove(max(data)) expected = quantiles(data, n=32) self.assertEqual(expected, actual) + # Q2 agrees with median() + for k in range(2, 60): + data = random.choices(range(100), k=k) + q1, q2, q3 = quantiles(data, method='inclusive') + self.assertEqual(q2, statistics.median(data)) def test_equal_inputs(self): quantiles = statistics.quantiles From df091f319fbc9c06a54af8c2e4e2d9f42d8e4106 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Thu, 16 May 2019 07:05:24 -0700 Subject: [PATCH 3/8] Test deciles --- Lib/test/test_statistics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Lib/test/test_statistics.py b/Lib/test/test_statistics.py index bf5f1067e7b912..946c7428c61311 100644 --- a/Lib/test/test_statistics.py +++ b/Lib/test/test_statistics.py @@ -2239,6 +2239,11 @@ def f(x): actual = quantiles(statistics.NormalDist(), n=n, method="inclusive") self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) for e, a in zip(expected, actual))) + # Natural deciles + self.assertEqual(quantiles([0, 100], n=10, method='inclusive'), + [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0]) + self.assertEqual(quantiles(range(0, 101), n=10, method='inclusive'), + [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0]) # Whenever n is smaller than the number of data points, running # method='inclusive' should give the same result as method='exclusive' # after the two included extreme points are removed. From 50a92ab1c44c8faac4ce2c74b70cf5642a0c2b19 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 May 2019 00:54:17 -0700 Subject: [PATCH 4/8] Elaborate on the *method* parameter --- Doc/library/statistics.rst | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index fb7df4e7188a07..dd80a1a46f8dce 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -522,11 +522,26 @@ However, for reading convenience, most of the examples show sorted sequences. value or compute ``106`` as the midpoint). This might matter if there are too few samples for a given number of cut points. - If *method* is set to *inclusive*, *dist* is treated as population data. - The minimum value is treated as the 0th percentile and the maximum - value is treated as the 100th percentile. If *dist* is an instance of - a class that defines an :meth:`~inv_cdf` method, setting *method* - has no effect. + The choice of *method* depends on whether dataset includes or + excludes the lowest and highest possible values from the + population. + + The *method* defaults to *exclusive*. This is used for data + sampled from a population with more extreme values than found in + the samples. The quantiles are computed with the assumption + that each data point, including the smallest and largest, + separates two continuous intervals with equal probability. + + When the *method* is set to *inclusive*, the minimum value in + *dist* is treated as the 0th percentile and the maximum value is + treated as the 100th percentile. This is useful for describing + population data. It also applies to sample data that is known to + include the most extreme possible values (such as a sample of test + scores including 0 and 100, the lowest and highest possible + scores). + + If *dist* is an instance of a class that defines an + :meth:`~inv_cdf` method, setting *method* has no effect. .. doctest:: From 1fee30bedfd1ff9d928d3ace6be42416008039fc Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 May 2019 02:54:33 -0700 Subject: [PATCH 5/8] Make the inclusive/exclusive explanation more precise and concise --- Doc/library/statistics.rst | 39 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index dd80a1a46f8dce..2410a9ee309af5 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -517,28 +517,23 @@ However, for reading convenience, most of the examples show sorted sequences. For sample data, the cut points are linearly interpolated from the two nearest data points. For example, if a cut point falls one-third of the distance between two sample values, ``100`` and ``112``, the - cut-point will evaluate to ``104``. Other selection methods may be - offered in the future (for example choose ``100`` as the nearest - value or compute ``106`` as the midpoint). This might matter if - there are too few samples for a given number of cut points. - - The choice of *method* depends on whether dataset includes or - excludes the lowest and highest possible values from the - population. - - The *method* defaults to *exclusive*. This is used for data - sampled from a population with more extreme values than found in - the samples. The quantiles are computed with the assumption - that each data point, including the smallest and largest, - separates two continuous intervals with equal probability. - - When the *method* is set to *inclusive*, the minimum value in - *dist* is treated as the 0th percentile and the maximum value is - treated as the 100th percentile. This is useful for describing - population data. It also applies to sample data that is known to - include the most extreme possible values (such as a sample of test - scores including 0 and 100, the lowest and highest possible - scores). + cut-point will evaluate to ``104``. + + Quantiles can be computed differently depending on whether the data + in *dist* includes or excludes the lowest and highest possible values + from the population. + + The default *method* is *exclusive* and is used for data sampled from + a population that can have more extreme values than found in the + samples. The portion of the population falling below the *i-th* of + *m* data points is computed as ``i // (m + 1)``. + + Setting the *method* to *inclusive* is used for describing population + data or for samples that include the extreme points. The minimum + value in *dist* is treated as the 0th percentile and the maximum + value is treated as the 100th percentile. The portion of the + population falling below the *i-th* of *m* data points is computed as + ``(i - 1) // (m - 1)``. If *dist* is an instance of a class that defines an :meth:`~inv_cdf` method, setting *method* has no effect. From 3a17ee7e9a63d9632c6143ee6f1b4e5ff51882d5 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 May 2019 03:04:14 -0700 Subject: [PATCH 6/8] Use true division in the doc formulas --- Doc/library/statistics.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 2410a9ee309af5..5f6d308b07cc16 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -523,17 +523,17 @@ However, for reading convenience, most of the examples show sorted sequences. in *dist* includes or excludes the lowest and highest possible values from the population. - The default *method* is *exclusive* and is used for data sampled from + The default *method* is "exclusive" and is used for data sampled from a population that can have more extreme values than found in the samples. The portion of the population falling below the *i-th* of - *m* data points is computed as ``i // (m + 1)``. + *m* data points is computed as ``i / (m + 1)``. - Setting the *method* to *inclusive* is used for describing population + Setting the *method* to "inclusive" is used for describing population data or for samples that include the extreme points. The minimum value in *dist* is treated as the 0th percentile and the maximum value is treated as the 100th percentile. The portion of the population falling below the *i-th* of *m* data points is computed as - ``(i - 1) // (m - 1)``. + ``(i - 1) / (m - 1)``. If *dist* is an instance of a class that defines an :meth:`~inv_cdf` method, setting *method* has no effect. From ab662fb923292ea34a26b8da12fe4f3bfd0138bc Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 May 2019 03:26:10 -0700 Subject: [PATCH 7/8] Note that there should be more data points than quantiles --- Doc/library/statistics.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 5f6d308b07cc16..344af2d7d1e0b3 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -511,7 +511,8 @@ However, for reading convenience, most of the examples show sorted sequences. is not least 1. The *dist* can be any iterable containing sample data or it can be an - instance of a class that defines an :meth:`~inv_cdf` method. + instance of a class that defines an :meth:`~inv_cdf` method. For meaningful + results, the number of data points in *dist* should be larger than *n*. Raises :exc:`StatisticsError` if there are not at least two data points. For sample data, the cut points are linearly interpolated from the From d55271689407d0e19a8434c146802701208c05cd Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sat, 18 May 2019 09:49:43 -0700 Subject: [PATCH 8/8] Mention *method* specifically. --- Doc/library/statistics.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst index 344af2d7d1e0b3..bc841fda72f887 100644 --- a/Doc/library/statistics.rst +++ b/Doc/library/statistics.rst @@ -520,9 +520,9 @@ However, for reading convenience, most of the examples show sorted sequences. of the distance between two sample values, ``100`` and ``112``, the cut-point will evaluate to ``104``. - Quantiles can be computed differently depending on whether the data - in *dist* includes or excludes the lowest and highest possible values - from the population. + The *method* for computing quantiles can be varied depending on + whether the data in *dist* includes or excludes the lowest and + highest possible values from the population. The default *method* is "exclusive" and is used for data sampled from a population that can have more extreme values than found in the