import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import signal
In this small tutorial we will use the U.S. COVID-19 inoculation data from one of my academic papers to find the most prominent peaks and dips in daily vaccinations and visualize them with Matplotlib.
Step 1: Importing the dependencies
We will use Pandas to read and manipulate the .csv file, Matplotlib for plotting the data, NumPy, and finally the argrelextrema function from SciPy's signal package to find the “extreme” values in the data.
Step 2: Import and filter the data by location; we will use a CSV file from Our World in Data.
# Load only the columns needed for the analysis and parse "date" as datetimes.
df_raw = pd.read_csv(
    "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv",
    usecols=["location", "date", "daily_vaccinations"],
    parse_dates=["date"],
)

# Keep the U.S. rows only; reset_index(drop=True) renumbers them from 0 and
# returns a new frame, so the filtered slice is never modified in place.
df = df_raw.loc[
    df_raw["location"] == "United States",
    ["date", "daily_vaccinations"],
].reset_index(drop=True)
df
 | date | daily_vaccinations |
---|---|---|
0 | 2020-12-13 | NaN |
1 | 2020-12-14 | 4848.0 |
2 | 2020-12-15 | 26366.0 |
3 | 2020-12-16 | 70961.0 |
4 | 2020-12-17 | 122030.0 |
... | ... | ... |
873 | 2023-05-05 | 79390.0 |
874 | 2023-05-06 | 79498.0 |
875 | 2023-05-07 | 79119.0 |
876 | 2023-05-08 | 74120.0 |
877 | 2023-05-09 | 62309.0 |
878 rows × 2 columns
Step 3: Find peaks and dips
# Plain NumPy arrays: dates for the x axis, daily vaccinations for the y axis.
data_x = df["date"].values
data_y = df["daily_vaccinations"].values

# Find peaks (np.greater); the index array is the first item of the result.
peak_indexes = signal.argrelextrema(data_y, np.greater)[0]

# Find valleys (np.less)
valley_indexes = signal.argrelextrema(data_y, np.less)[0]
Step 4: Plot the data
# Draw the full series in grey and overlay the detected dips and peaks.
plt.figure(figsize=(20, 5))
plt.plot(data_x, data_y, color="grey")
plt.plot(
    data_x[valley_indexes],
    data_y[valley_indexes],
    "o",
    label="dip",
    color="r",
)
plt.plot(
    data_x[peak_indexes],
    data_y[peak_indexes],
    "o",
    label="peak",
    color="g",
)
plt.show()
Exploring the data
We can look at peak and dip dates and the corresponding values:
# Pair every peak date with the vaccination count recorded on that day.
list(
    zip(
        data_x[peak_indexes],
        data_y[peak_indexes],
    )
)
[(numpy.datetime64('2020-12-19T00:00:00.000000000'), 182230.0),
(numpy.datetime64('2020-12-23T00:00:00.000000000'), 343482.0),
(numpy.datetime64('2021-01-17T00:00:00.000000000'), 1032650.0),
(numpy.datetime64('2021-02-06T00:00:00.000000000'), 1683043.0),
(numpy.datetime64('2021-02-14T00:00:00.000000000'), 1854830.0),
(numpy.datetime64('2021-03-17T00:00:00.000000000'), 2637533.0),
(numpy.datetime64('2021-04-01T00:00:00.000000000'), 3240378.0),
(numpy.datetime64('2021-04-07T00:00:00.000000000'), 3248620.0),
(numpy.datetime64('2021-04-11T00:00:00.000000000'), 3508126.0),
(numpy.datetime64('2021-05-19T00:00:00.000000000'), 1937892.0),
(numpy.datetime64('2021-06-07T00:00:00.000000000'), 1110118.0),
(numpy.datetime64('2021-06-09T00:00:00.000000000'), 1101934.0),
(numpy.datetime64('2021-07-12T00:00:00.000000000'), 501021.0),
(numpy.datetime64('2021-08-10T00:00:00.000000000'), 716144.0),
(numpy.datetime64('2021-08-21T00:00:00.000000000'), 844978.0),
(numpy.datetime64('2021-08-30T00:00:00.000000000'), 884728.0),
(numpy.datetime64('2021-09-11T00:00:00.000000000'), 688271.0),
(numpy.datetime64('2021-09-13T00:00:00.000000000'), 773381.0),
(numpy.datetime64('2021-10-01T00:00:00.000000000'), 990508.0),
(numpy.datetime64('2021-10-29T00:00:00.000000000'), 1355393.0),
(numpy.datetime64('2021-11-07T00:00:00.000000000'), 1403469.0),
(numpy.datetime64('2021-11-10T00:00:00.000000000'), 1450518.0),
(numpy.datetime64('2021-11-16T00:00:00.000000000'), 1496507.0),
(numpy.datetime64('2021-11-23T00:00:00.000000000'), 1606000.0),
(numpy.datetime64('2021-12-06T00:00:00.000000000'), 1847858.0),
(numpy.datetime64('2021-12-22T00:00:00.000000000'), 1598127.0),
(numpy.datetime64('2022-01-02T00:00:00.000000000'), 1105073.0),
(numpy.datetime64('2022-01-10T00:00:00.000000000'), 1263819.0),
(numpy.datetime64('2022-02-20T00:00:00.000000000'), 365589.0),
(numpy.datetime64('2022-03-20T00:00:00.000000000'), 185646.0),
(numpy.datetime64('2022-04-08T00:00:00.000000000'), 504605.0),
(numpy.datetime64('2022-04-14T00:00:00.000000000'), 515091.0),
(numpy.datetime64('2022-04-24T00:00:00.000000000'), 431295.0),
(numpy.datetime64('2022-04-29T00:00:00.000000000'), 395823.0),
(numpy.datetime64('2022-05-01T00:00:00.000000000'), 393300.0),
(numpy.datetime64('2022-05-15T00:00:00.000000000'), 362071.0),
(numpy.datetime64('2022-05-24T00:00:00.000000000'), 383580.0),
(numpy.datetime64('2022-06-06T00:00:00.000000000'), 306685.0),
(numpy.datetime64('2022-06-27T00:00:00.000000000'), 236437.0),
(numpy.datetime64('2022-07-19T00:00:00.000000000'), 312407.0),
(numpy.datetime64('2022-09-04T00:00:00.000000000'), 101898.0),
(numpy.datetime64('2022-09-17T00:00:00.000000000'), 499778.0),
(numpy.datetime64('2022-09-30T00:00:00.000000000'), 615134.0),
(numpy.datetime64('2022-10-14T00:00:00.000000000'), 662654.0),
(numpy.datetime64('2022-10-17T00:00:00.000000000'), 661305.0),
(numpy.datetime64('2022-10-28T00:00:00.000000000'), 657857.0),
(numpy.datetime64('2022-12-03T00:00:00.000000000'), 323924.0),
(numpy.datetime64('2022-12-06T00:00:00.000000000'), 327958.0),
(numpy.datetime64('2022-12-13T00:00:00.000000000'), 322385.0),
(numpy.datetime64('2023-01-02T00:00:00.000000000'), 176256.0),
(numpy.datetime64('2023-01-09T00:00:00.000000000'), 193697.0),
(numpy.datetime64('2023-01-23T00:00:00.000000000'), 116013.0),
(numpy.datetime64('2023-02-21T00:00:00.000000000'), 62934.0),
(numpy.datetime64('2023-02-27T00:00:00.000000000'), 59727.0),
(numpy.datetime64('2023-03-05T00:00:00.000000000'), 58888.0),
(numpy.datetime64('2023-03-26T00:00:00.000000000'), 48921.0),
(numpy.datetime64('2023-04-14T00:00:00.000000000'), 38791.0),
(numpy.datetime64('2023-04-16T00:00:00.000000000'), 39047.0),
(numpy.datetime64('2023-04-28T00:00:00.000000000'), 73366.0),
(numpy.datetime64('2023-05-06T00:00:00.000000000'), 79498.0)]
# Pair every dip date with the vaccination count recorded on that day.
list(
    zip(
        data_x[valley_indexes],
        data_y[valley_indexes],
    )
)
[(numpy.datetime64('2020-12-20T00:00:00.000000000'), 171512.0),
(numpy.datetime64('2020-12-27T00:00:00.000000000'), 266832.0),
(numpy.datetime64('2021-01-18T00:00:00.000000000'), 1005887.0),
(numpy.datetime64('2021-02-07T00:00:00.000000000'), 1680143.0),
(numpy.datetime64('2021-02-20T00:00:00.000000000'), 1505355.0),
(numpy.datetime64('2021-03-20T00:00:00.000000000'), 2603838.0),
(numpy.datetime64('2021-04-04T00:00:00.000000000'), 3128761.0),
(numpy.datetime64('2021-04-08T00:00:00.000000000'), 3237451.0),
(numpy.datetime64('2021-05-14T00:00:00.000000000'), 1801903.0),
(numpy.datetime64('2021-06-03T00:00:00.000000000'), 911022.0),
(numpy.datetime64('2021-06-08T00:00:00.000000000'), 1095082.0),
(numpy.datetime64('2021-07-08T00:00:00.000000000'), 451686.0),
(numpy.datetime64('2021-07-17T00:00:00.000000000'), 482398.0),
(numpy.datetime64('2021-08-11T00:00:00.000000000'), 715034.0),
(numpy.datetime64('2021-08-22T00:00:00.000000000'), 843679.0),
(numpy.datetime64('2021-09-10T00:00:00.000000000'), 682253.0),
(numpy.datetime64('2021-09-12T00:00:00.000000000'), 686392.0),
(numpy.datetime64('2021-09-23T00:00:00.000000000'), 640587.0),
(numpy.datetime64('2021-10-21T00:00:00.000000000'), 745743.0),
(numpy.datetime64('2021-11-03T00:00:00.000000000'), 1329261.0),
(numpy.datetime64('2021-11-08T00:00:00.000000000'), 1400261.0),
(numpy.datetime64('2021-11-11T00:00:00.000000000'), 1444565.0),
(numpy.datetime64('2021-11-17T00:00:00.000000000'), 1494376.0),
(numpy.datetime64('2021-11-29T00:00:00.000000000'), 1093098.0),
(numpy.datetime64('2021-12-18T00:00:00.000000000'), 1508476.0),
(numpy.datetime64('2021-12-29T00:00:00.000000000'), 1042099.0),
(numpy.datetime64('2022-01-06T00:00:00.000000000'), 979135.0),
(numpy.datetime64('2022-02-19T00:00:00.000000000'), 365365.0),
(numpy.datetime64('2022-03-18T00:00:00.000000000'), 184913.0),
(numpy.datetime64('2022-03-28T00:00:00.000000000'), 167674.0),
(numpy.datetime64('2022-04-10T00:00:00.000000000'), 501731.0),
(numpy.datetime64('2022-04-22T00:00:00.000000000'), 420499.0),
(numpy.datetime64('2022-04-28T00:00:00.000000000'), 393721.0),
(numpy.datetime64('2022-04-30T00:00:00.000000000'), 393183.0),
(numpy.datetime64('2022-05-09T00:00:00.000000000'), 344649.0),
(numpy.datetime64('2022-05-18T00:00:00.000000000'), 359461.0),
(numpy.datetime64('2022-06-05T00:00:00.000000000'), 265346.0),
(numpy.datetime64('2022-06-21T00:00:00.000000000'), 224478.0),
(numpy.datetime64('2022-07-07T00:00:00.000000000'), 180502.0),
(numpy.datetime64('2022-09-03T00:00:00.000000000'), 101239.0),
(numpy.datetime64('2022-09-05T00:00:00.000000000'), 83600.0),
(numpy.datetime64('2022-09-18T00:00:00.000000000'), 498628.0),
(numpy.datetime64('2022-10-02T00:00:00.000000000'), 613824.0),
(numpy.datetime64('2022-10-16T00:00:00.000000000'), 658109.0),
(numpy.datetime64('2022-10-24T00:00:00.000000000'), 647301.0),
(numpy.datetime64('2022-11-29T00:00:00.000000000'), 229530.0),
(numpy.datetime64('2022-12-04T00:00:00.000000000'), 323327.0),
(numpy.datetime64('2022-12-09T00:00:00.000000000'), 316737.0),
(numpy.datetime64('2022-12-28T00:00:00.000000000'), 151812.0),
(numpy.datetime64('2023-01-05T00:00:00.000000000'), 159273.0),
(numpy.datetime64('2023-01-22T00:00:00.000000000'), 115084.0),
(numpy.datetime64('2023-02-20T00:00:00.000000000'), 62661.0),
(numpy.datetime64('2023-02-26T00:00:00.000000000'), 58351.0),
(numpy.datetime64('2023-03-04T00:00:00.000000000'), 58873.0),
(numpy.datetime64('2023-03-25T00:00:00.000000000'), 48914.0),
(numpy.datetime64('2023-04-13T00:00:00.000000000'), 38469.0),
(numpy.datetime64('2023-04-15T00:00:00.000000000'), 38520.0),
(numpy.datetime64('2023-04-19T00:00:00.000000000'), 36475.0),
(numpy.datetime64('2023-04-30T00:00:00.000000000'), 72630.0)]
However, these data don’t give us much useful information. Instead, we can look at the longest streaks of consecutive peak and dip days. In order to do that, we need to refer back to the previous figure. Because the signal.argrelextrema function doesn’t take into account the beginning and end of the data, our first working point is a peak 6 days after the start of vaccination on December 13, 2020.
# First date in the series.
data_x[0]
numpy.datetime64('2020-12-13T00:00:00.000000000')
# Date of the first detected peak.
data_x[peak_indexes][0]
numpy.datetime64('2020-12-19T00:00:00.000000000')
Because a dip streak is basically the difference between the last peak and the next dip, in order to find the period between the two, we can subtract the date of the peak from the date of the following dip.
First, we zip these two arrays together to be able to iterate through them at the same time. We then subtract the first value of each pair from the second. Because the resulting dtype is NumPy’s timedelta64, the difference is represented in nanoseconds. In order to get the value in days we need to divide it by 86400000000000, which is the number of nanoseconds in 24 hours. Then we store the resulting deltas in a list; we find the max value in the list as well as its index. We can use this index value to refer back to the value pair of the period of interest – in this case it is the period of 46 days between July 19 and September 3, 2022.
# Dips: pair every peak date with the date of the dip at the same position.
dip_strikes = list(
    zip(
        data_x[peak_indexes],
        data_x[valley_indexes],
    )
)

# The difference of two dates is a timedelta in nanoseconds; integer-dividing
# by 86400000000000 (nanoseconds in 24 hours) converts it to whole days.
dip_strike_days = [
    int(dip_date - peak_date) // 86400000000000
    for peak_date, dip_date in dip_strikes
]

# Longest dip streak and the (peak, dip) date pair it belongs to.
max_days = max(dip_strike_days)
index_max_dip = dip_strike_days.index(max_days)
print(dip_strikes[index_max_dip], max_days)
(numpy.datetime64('2022-07-19T00:00:00.000000000'), numpy.datetime64('2022-09-03T00:00:00.000000000')) 46
We now repeat the same procedure for the peaks. The only difference is that we now need the dip-peak values instead of the peak-dip values in the previous step. We can simply remove the first peak value and we will end up with matching dip-peak value pairs for the rest of the analysis.
# Peak: pair every dip date with the next peak date; dropping the first peak
# shifts the peaks by one so that each dip lines up with the peak after it.
peak_strikes = list(zip(data_x[valley_indexes], data_x[peak_indexes][1:]))

# The difference of two dates is a timedelta in nanoseconds; integer-dividing
# by 86400000000000 (nanoseconds in 24 hours) converts it to whole days.
peak_strikes_days = [
    int(peak_date - dip_date) // 86400000000000
    for dip_date, peak_date in peak_strikes
]

# Longest rising streak and the (dip, peak) date pair it belongs to.
max_days = max(peak_strikes_days)
index_max_peak = peak_strikes_days.index(max_days)
print(peak_strikes[index_max_peak], max_days)
(numpy.datetime64('2021-02-20T00:00:00.000000000'), numpy.datetime64('2021-03-17T00:00:00.000000000')) 25
We end up with the value of 25 days in the period between February 20 and March 17, 2021.
Now we visualize these periods by modifying the previous plot.
fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(data_x, data_y, color="grey")  # line plot for the original data
ax.plot(
    data_x[valley_indexes],
    data_y[valley_indexes],
    "o",
    label="dip",
    color="r",
)
ax.plot(
    data_x[peak_indexes],
    data_y[peak_indexes],
    "o",
    label="peak",
    color="g",
)
# Shade the longest rising streak (dip -> peak) in green.
ax.axvspan(
    peak_strikes[index_max_peak][0],
    peak_strikes[index_max_peak][1],
    color="g",
    alpha=0.4,
)
# Shade the longest dipping streak (peak -> dip) in red.
ax.axvspan(
    dip_strikes[index_max_dip][0],
    dip_strikes[index_max_dip][1],
    color="r",
    alpha=0.4,
)
Without additional context (for example political or other events happening during these periods) this also doesn’t give us much useful information. However, one thing is clear – the two dip periods marked in blue below, despite being visually prominent, are not the longest dipping periods – the one highlighted in red is.