#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
:author: Maximilian Golla
:contact: maximilian.golla@rub.de
:version: 0.0.5, 2022-02-01
:description: Parses and formats RKI Todesfaelle nach Sterbedatum
:data: https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Projekte_RKI/COVID-19_Todesfaelle.xlsx
'''

# Provenance: recovered from the patch "Initialer Commit" (8d7e2c8, qbi,
# 2022-02-03), which created this file as parse_todesfaelle.py.

import sys
import math
from collections import OrderedDict

# Per-state divisor used to normalize the reported death counts.
# NOTE(review): the values look like population in thousands, which would make
# the normalized figure "deaths per 100,000 inhabitants" -- confirm.
STATES = {
    'BB': 2512,   # Brandenburg
    'BE': 3645,   # Berlin
    'BW': 11070,  # Baden-Württemberg
    'BY': 13077,  # Bayern
    'HB': 683,    # Bremen
    'HE': 6266,   # Hessen
    'HH': 1841,   # Hamburg
    'MV': 1610,   # Mecklenburg-Vorpommern
    'NI': 7982,   # Niedersachsen
    'NW': 17933,  # Nordrhein-Westfalen
    'RP': 4085,   # Rheinland-Pfalz
    'SH': 2897,   # Schleswig-Holstein
    'SL': 991,    # Saarland
    'SN': 4078,   # Sachsen
    'ST': 2208,   # Sachsen-Anhalt
    'TH': 2143    # Thüringen
}

# (year, first week, last week + 1) for every year covered by the RKI file.
_WEEK_RANGES = (
    ("2020", 10, 54),  # In 2020 the RKI reports weeks 10..53
    ("2021", 1, 53),   # In 2021 the RKI reports 52 weeks
    ("2022", 1, 2),    # In 2022 the RKI reports 1 week
)


def read_file(filename):
    '''Return the lines of *filename* without the header and line endings.

    The first line (the RKI header) is discarded; every following line is
    returned with trailing CR/LF characters removed.
    '''
    with open(filename, 'r') as inputfile:
        inputfile.readline()  # Skip the RKI header
        return [line.rstrip('\r\n') for line in inputfile]


def parse(data):
    '''Turn CSV lines into a nested ``year -> week -> state -> value`` mapping.

    Each line is expected to look like ``state,week,year,deaths``, where
    ``deaths`` may be a censored value such as ``<4`` (counted as 3).
    Weeks are stored as two-character, zero-padded strings. Every
    year/week/state listed in ``_WEEK_RANGES``/``STATES`` is pre-filled with 0,
    so weeks without a row are still present in the result.

    The stored value is ``deaths * 100 / STATES[state]`` rounded to 2 digits.

    Blank lines are ignored. A second row for the same year/week/state is
    treated as a data error: a message is written to stderr and the process
    exits with status -1. Rows that reference an unknown state, year or week
    raise ``KeyError``.
    '''
    result = OrderedDict()
    # Initialize the data structure with 0
    for year, start, stop in _WEEK_RANGES:
        result[year] = OrderedDict()
        for week in range(start, stop):
            result[year][str(week).zfill(2)] = dict.fromkeys(STATES, 0)

    # Parse the actual data
    seen = set()
    for line in data:
        if not line.strip():
            continue  # e.g. a trailing empty line in the CSV
        fields = line.split(',')
        state = fields[0]
        week = str(fields[1]).zfill(2)
        year = str(fields[2])
        # Special treatment of "<4" cases
        if '<' in fields[3]:
            dead = int(fields[3].replace('<', '')) - 1
        else:
            dead = int(fields[3])
        # Normalize the data
        dead = round((dead * 100.0) / STATES[state], 2)
        # Detect possible data issues in RKI data. Track the seen keys instead
        # of testing for "!= 0", so a duplicate is also caught when the first
        # row carried the value 0.
        key = (year, week, state)
        if key in seen:
            sys.stderr.write("Error in RKI data: Year {} Week {} State {}\n".format(year, week, state))
            sys.exit(-1)
        seen.add(key)
        result[year][week][state] = dead
    return result


def _format_signed(value, suffix=''):
    '''Render *value* with an explicit sign ("+-" for zero) plus *suffix*.'''
    if value == 0:
        return "+-" + str(value) + suffix
    if value > 0:
        return "+" + str(value) + suffix
    return str(value) + suffix


def query(data, base, compare):
    '''Print the change of the cumulative values between two weeks.

    *data* is the mapping produced by ``parse``; *base* and *compare* are
    strings of the form ``YYYYWW`` (e.g. ``"202130"``). The base week must not
    be later than the compare week. For every state the values are summed from
    the first week in *data* up to the base week and up to the compare week;
    the difference is printed as an absolute value and as a percentage of the
    base sum.

    "Invalid query" is printed (and nothing else is computed) when a week is
    not part of *data* or when base is later than compare. If a state's base
    sum is 0 the percentage is undefined and reported as ``n/a``.
    '''
    b_year, b_week = base[0:4], base[4:6]
    c_year, c_week = compare[0:4], compare[4:6]

    # Reject weeks that are not in the data instead of failing with KeyError
    if b_week not in data.get(b_year, {}) or c_week not in data.get(c_year, {}):
        print("Invalid query")
        return

    print("Base week: {}-{}: {}".format(b_year, b_week, data[b_year][b_week]))
    print("Compare week: {}-{}: {}\n".format(c_year, c_week, data[c_year][c_week]))

    # Base must not be later than compare (zero-padded strings sort correctly)
    if (b_year, b_week) > (c_year, c_week):
        print("Invalid query")
        return

    # There is no cool way to determine the number of weeks between 202033 and
    # 202119 because of RKI, so enumerate all known weeks in order.
    all_entries = [year + week for year in data for week in data[year]]

    # Determine the index in our list which marks the base and compare week
    start = all_entries.index(b_year + b_week)
    end = all_entries.index(c_year + c_week)

    # Init data structures
    base_value = dict.fromkeys(STATES, 0)
    differences = dict.fromkeys(STATES, 0)
    difference_percent = dict.fromkeys(STATES, 0)

    # Sum the dead from the beginning of the pandemic to the base week, and
    # from the beginning of the pandemic to the compare week
    for i in range(0, end + 1):
        year = all_entries[i][0:4]
        week = all_entries[i][4:6]
        for state in STATES:
            if i <= start:
                base_value[state] += data[year][week][state]
            differences[state] += data[year][week][state]

    print("Beginning to Base Week: ", base_value)
    print("Beginning to Compare Week:", differences, "\n")

    # Determine the change from the base week to the compare week in absolute
    # and percent
    for state in STATES:
        # Absolute
        diff = round(differences[state] - base_value[state], 2)

        # Percentage; undefined when nothing was reported up to the base week
        if base_value[state] == 0:
            difference_percent[state] = "n/a"
        else:
            diff_percent = round((diff * 100.0) / base_value[state], 2)
            difference_percent[state] = _format_signed(diff_percent, " %")

        differences[state] = _format_signed(diff)

    print("Change (Absolute):")
    print(differences)
    print("Change (Percent):")
    print(difference_percent)


def output(data):
    '''Print *data* as a tab-separated table with one row per year/week.

    The header row is ``Jahr``, ``Woche`` followed by the state codes in
    ``STATES`` order; each data row holds the year, the week and the stored
    value for every state.
    '''
    # Print the header
    print("\t".join(["Jahr", "Woche"] + list(STATES)))

    # Print the main data
    for year in data:
        for week in data[year]:
            line = [str(year), str(week)]
            line.extend(str(data[year][week][state]) for state in STATES)
            print("\t".join(line))


def main():
    '''Parse ``COVID-19_Todesfaelle.csv`` and compare two sample weeks.'''
    data = read_file('COVID-19_Todesfaelle.csv')
    data = parse(data)
    #output(data)
    query(data, "202130", "202131")


if __name__ == '__main__':
    main()