covid/ingest3.py
2021-10-30 11:04:56 -07:00

252 lines
5.7 KiB
Python

#!/usr/bin/env python3
import csv
from influxdb import InfluxDBClient
from urllib.parse import urlparse
import datetime
def f2int(x):
    """Convert a numeric value or string such as ``"12.0"`` to ``int``.

    The value goes through ``float`` first, so decimal strings are accepted;
    the fractional part is truncated by ``int``.
    """
    return int(float(x))


# Maps each CSV column name to the callable used to parse its raw cell text.
# A value of None marks a column that get_rows() leaves out of the parsed row.
row_fields = {
    'Hospitalization_Rate': float,
    'People_Hospitalized': f2int,
    'Incident_Rate': float,
    'Province_State': str,
    'FIPS': f2int,
    'People_Tested': f2int,
    'Lat': float,
    'Long_': float,
    'ISO3': str,
    'Testing_Rate': float,
    'Deaths': f2int,
    'Mortality_Rate': float,
    'Recovered': f2int,
    'Confirmed': f2int,
    'UID': f2int,
    'Last_Update': None,
    'Active': f2int,
    'Country_Region': str,
    'Total_Test_Results': f2int,
    'Case_Fatality_Ratio': float,
    'Cases_28_Days': f2int,
    'Deaths_28_Days': f2int,
}
# State/territory names grouped under a color label, following the 2016
# presidential results page below. Names listed under "other" belong to
# neither the "red" nor the "blue" group.
# https://www.nytimes.com/elections/2016/results/president
states = {
    "red": {
        "Alabama",
        "Alaska",
        "Arizona",
        "Arkansas",
        "Florida",
        "Georgia",
        "Idaho",
        "Indiana",
        "Iowa",
        "Kansas",
        "Kentucky",
        "Louisiana",
        "Michigan",
        "Mississippi",
        "Missouri",
        "Montana",
        "Nebraska",
        "North Carolina",
        "North Dakota",
        "Ohio",
        "Oklahoma",
        "Pennsylvania",
        "South Carolina",
        "South Dakota",
        "Tennessee",
        "Texas",
        "Utah",
        "West Virginia",
        "Wisconsin",
        "Wyoming",
    },
    "blue": {
        "California",
        "Colorado",
        "Connecticut",
        "Delaware",
        "District of Columbia",
        "Hawaii",
        "Illinois",
        "Maine",
        "Maryland",
        "Massachusetts",
        "Minnesota",
        "Nevada",
        "New Hampshire",
        "New Jersey",
        "New Mexico",
        "New York",
        "Oregon",
        "Rhode Island",
        "Vermont",
        "Virginia",
        "Washington",
    },
    "other": {
        "American Samoa",
        "Diamond Princess",
        "Grand Princess",
        "Guam",
        "Northern Mariana Islands",
        "Puerto Rico",
        "Virgin Islands",
    },
}
# State/territory names grouped under a color label, following the 2020
# electoral map below. Names listed under 'other' belong to neither the
# 'red' nor the 'blue' group.
# https://upload.wikimedia.org/wikipedia/commons/4/49/ElectoralCollege2020.svg
states_2020 = {
    'red': {
        'Ohio',
        'Montana',
        'South Dakota',
        'Tennessee',
        'Nebraska',
        'North Dakota',
        'Mississippi',
        'Utah',
        'Missouri',  # carried by the Republican ticket in 2020, as in 2016
        'Alaska',
        'Idaho',
        'Arkansas',
        'Wyoming',
        'Alabama',
        'Indiana',
        'Kentucky',
        'Louisiana',
        'Kansas',
        'Florida',
        'Iowa',
        'Oklahoma',
        'Texas',
        'West Virginia',
        'South Carolina',
        'North Carolina',
    },
    'blue': {
        'Georgia',
        'Pennsylvania',
        'Arizona',
        'Wisconsin',
        'Michigan',
        'Minnesota',
        'New Mexico',
        'Oregon',
        'Nevada',
        'New Jersey',
        'Colorado',
        'Washington',
        'New Hampshire',
        'District of Columbia',
        'Maryland',
        'Virginia',
        'California',
        'Hawaii',
        'Massachusetts',
        'New York',
        'Rhode Island',
        'Vermont',
        'Connecticut',
        'Delaware',
        'Illinois',
        'Maine',
    },
    'other': {
        'American Samoa',
        'Guam',
        'Puerto Rico',
        'Diamond Princess',
        'Virgin Islands',
        'Grand Princess',
        'Northern Mariana Islands',
    },
}
# Reverse lookups: state/territory name -> color label, one per election.
# Comprehensions are used so that no loop variable rebinds the module-level
# ``states`` dict (a plain ``for color, states in states.items()`` would
# leave ``states`` pointing at a single set once the loop finishes).
states_bycolor = {
    name: color
    for color, names in states.items()
    for name in names
}
states_bycolor_2020 = {
    name: color
    for color, names in states_2020.items()
    for name in names
}
def convert(func, inp):
    """Parse one raw cell value with ``func``.

    An empty string is treated as zero: ``func(0)`` is returned rather than
    ``func("")``. Any other input is handed to ``func`` unchanged.
    """
    raw = 0 if inp == "" else inp
    return func(raw)
def get_rows(fpath):
    """Yield one parsed dict per data row of the CSV file at ``fpath``.

    The first line of the file is taken as the header. Every following
    non-blank line becomes a dict mapping column name to
    ``convert(row_fields[name], cell)``. Columns whose ``row_fields`` entry
    is falsy (for example ``None``) are left out of the dict.

    Raises:
        KeyError: a header name is not a key of ``row_fields``. This is
            raised as soon as the header has been read.
        IndexError: a data line has fewer cells than a kept column needs.
    """
    # newline="" is what the csv module asks for, so that line breaks inside
    # quoted cells are interpreted by the reader and not by open().
    with open(fpath, "r", newline="") as f:
        reader = csv.reader(f)
        headers = next(reader, None)
        if headers is None:
            return  # empty file: nothing to yield

        # Resolve each column's converter once per file, not once per cell.
        columns = []
        for index, name in enumerate(headers):
            parser = row_fields[name]
            if parser:
                columns.append((index, name, parser))

        for line in reader:
            if not line:
                continue  # csv.reader returns [] for a completely blank line
            yield {
                name: convert(parser, line[index])
                for index, name, parser in columns
            }
def get_data_for_influx(fpath, assigned_date=None):
    """Build the list of point dicts for the report file at ``fpath``.

    Each row produced by ``get_rows(fpath)`` becomes one point with
    measurement ``"covid"``, except rows whose ``Province_State`` equals
    ``"Recovered"``, which are skipped. The point is tagged with the state
    name, its ``ISO3`` value and its color in ``states_bycolor`` and
    ``states_bycolor_2020``; the whole parsed row is used as the fields.

    ``assigned_date`` is used as the point time when truthy; otherwise the
    row's ``Last_Update`` value is looked up.

    Raises:
        KeyError: the state name is missing from either color lookup, or a
            required column is missing from the row.
    """
    points = []
    for record in get_rows(fpath):
        province = record["Province_State"]
        if province != "Recovered":
            tags = {
                "state": province,
                "iso3": record["ISO3"],
                "color": states_bycolor[province],
                "color_2020": states_bycolor_2020[province],
            }
            # NOTE(review): row_fields maps "Last_Update" to None and
            # get_rows() omits such columns, so this fallback looks like it
            # raises KeyError whenever assigned_date is falsy -- confirm.
            timestamp = assigned_date or record["Last_Update"]
            points.append({
                "measurement": "covid",
                "tags": tags,
                "time": timestamp,
                "fields": record,
            })
    return points
def ingest_file(influx_client, fname, assigned_date):
    """Parse the report file ``fname`` and write its points to InfluxDB.

    The points come from ``get_data_for_influx(fname, assigned_date)`` and
    are passed in a single ``influx_client.write_points`` call. Returns
    ``None``.
    """
    influx_client.write_points(get_data_for_influx(fname, assigned_date))
def main():
    """Ingest every daily US report from 2020-04-12 up to yesterday.

    Connects to the InfluxDB endpoint at ``http://localhost:10019/``, creates
    and selects the ``covid`` database, then, for each date from 2020-04-12
    up to but excluding today, prints the report path and ingests that file
    with the date (at 00:00:00) as the timestamp of all its points.
    """
    endpoint = urlparse("http://localhost:10019/")
    client = InfluxDBClient(endpoint.hostname, str(endpoint.port))  # user, password)
    client.create_database("covid")
    client.switch_database("covid")

    first_day = datetime.date(year=2020, month=4, day=12)
    today = datetime.date.today()
    # One iteration per calendar day in [first_day, today).
    for offset in range((today - first_day).days):
        day = first_day + datetime.timedelta(days=offset)
        fname = f"COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/{day:%m-%d-%Y}.csv"
        print(fname)
        ingest_file(client, fname, day.strftime("%Y-%m-%dT%H:%M:%SZ"))


if __name__ == "__main__":
    main()