Commit e58ebdf9 authored by David Wolfe
Browse files

initial analysis

parent 34b4468e
This diff is collapsed.
import pandas as pd
import numpy as np
def getData(filename):
    """Load the project-history CSV and return the rows that have a start date.

    Args:
        filename: Path to a CSV file encoded as ISO-8859-1.

    Returns:
        A DataFrame without the placeholder columns pandas creates for
        header cells that are blank (named "Unnamed: N"), and without the
        rows whose 'Date Started' value is missing.

    Raises:
        KeyError: If the file has no 'Date Started' column.
    """
    data = pd.read_csv(filename, encoding='iso-8859-1')
    # Select the columns to keep instead of dropping in place while iterating
    # over the same frame. Only the "Unnamed:" prefix pandas generates is
    # matched, so a real column that merely contains the word is preserved.
    keep = [col for col in data.columns if not str(col).startswith('Unnamed:')]
    data = data[keep]
    return data.dropna(subset=['Date Started'])
if __name__ == "__main__":
    # Manual run: load the project-history export from its default location.
    filename = 'data/RSP Project History.csv'
    data = getData(filename=filename)
import numpy as np
import pandas as pd
import scipy.stats
import load_data
import write_output_data
import datetime
import matplotlib.pyplot as plt
def get_dept_counts(data, start_date=None, end_date=None):
    """Count projects per unit, optionally restricted to a start-date window.

    Args:
        data: DataFrame with a "Unit" column and a "Date Started" column.
        start_date: Inclusive lower bound as a ``[day, month, year]`` sequence.
        end_date: Exclusive upper bound as a ``[day, month, year]`` sequence.

    The window is applied only when BOTH bounds are supplied; otherwise every
    row is counted.

    Returns:
        A ``(unique_dpts, output)`` pair. ``unique_dpts`` holds every distinct
        unit found in ``data`` (in order of first appearance, regardless of
        the window). ``output`` maps each of those units to the number of
        rows inside the window, 0 when there are none.
    """
    units = data["Unit"]
    if start_date is not None and end_date is not None:
        # Parse the string column; rows that fail to compare (NaT) fall
        # outside the window.
        dates = pd.to_datetime(data["Date Started"])
        # Endpoints arrive as [day, month, year]; datetime wants y, m, d.
        window_start = datetime.datetime(start_date[2], start_date[1], start_date[0])
        window_end = datetime.datetime(end_date[2], end_date[1], end_date[0])
        in_window = (dates >= window_start) & (dates < window_end)
        units = units[in_window]

    unique_dpts = pd.unique(data["Unit"])
    # One pass over the selected rows; missing units default to zero below.
    tallies = units.value_counts().to_dict()
    output = {dpt: int(tallies.get(dpt, 0)) for dpt in unique_dpts}
    return unique_dpts, output
def make_bar_plot(x, y):
    """Draw a bar chart of ``y`` against ``x`` with the tallest bar first.

    Args:
        x: Bar labels/positions, one per value in ``y``.
        y: Bar heights.

    Returns:
        The value returned by ``plt.bar`` for the drawn bars.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # argsort is ascending; flip it to get descending bar heights.
    sorted_inds = np.flip(np.argsort(y))
    # NOTE(review): the plotting call on this line was truncated in the
    # source; it is reconstructed as plt.bar over the sorted x/y -- confirm.
    barplot = plt.bar(x[sorted_inds], y[sorted_inds])
    return barplot
def get_jsdist(p, q):
    """Return the Jensen-Shannon divergence between two distributions.

    Args:
        p: Non-negative weights (counts or probabilities); any array-like.
        q: Non-negative weights of the same length as ``p``.

    Returns:
        The mean of the two relative entropies against the midpoint
        distribution, in nats (``scipy.stats.entropy`` default base). This is
        the divergence itself, not its square root. The inputs are not
        modified. If either input sums to zero the result is NaN.
    """
    # Copy as float: an in-place divide on an integer array (e.g. raw counts)
    # cannot store the fractional result and raises.
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)
    p /= p.sum()
    q /= q.sum()
    m = (p + q) / 2
    return (scipy.stats.entropy(p, m) + scipy.stats.entropy(q, m)) / 2
def make_line_plots(xvals, dist1, dist2, labels=('2018', '2019')):
    """Plot two series on the current axes, ordered by descending ``dist1``.

    Args:
        xvals: X-axis values, one per entry of the series.
        dist1: First series; its descending order fixes the order of both lines.
        dist2: Second series, drawn in the same order as ``dist1``.
        labels: Legend entries for ``dist1`` and ``dist2`` respectively.
    """
    xvals = np.asarray(xvals)
    dist1 = np.asarray(dist1)
    dist2 = np.asarray(dist2)
    # argsort is ascending; flip it so the largest dist1 value comes first.
    sorted_inds = np.flip(np.argsort(dist1))
    plt.plot(xvals[sorted_inds], dist1[sorted_inds], '-o')
    plt.plot(xvals[sorted_inds], dist2[sorted_inds], '-o')
    plt.legend(list(labels))
if __name__ == '__main__':
    # Tabulate per-unit project counts for consecutive year intervals and
    # write one sheet per interval to an Excel workbook.
    filename = 'data/RSP Project History.csv'
    # Write next to the input, never over it: the workbook needs an Excel
    # extension and the CSV must stay intact.
    output_filename = 'data/RSP Project History.xlsx'
    data = load_data.getData(filename)
    years = [2015, 2018, 2021]
    interval_count_dicts = {}
    # One flag per unit, in the same order get_dept_counts returns them.
    seen_units = np.zeros(pd.unique(data["Unit"]).size, dtype=bool)
    previous_counts = None
    for first_year, last_year in zip(years, years[1:]):
        units, counts = get_dept_counts(
            data, start_date=[1, 1, first_year], end_date=[1, 1, last_year])
        interval_counts = [counts[u] for u in units]
        # A unit is new when it has projects now and had none in any
        # earlier interval.
        active = np.array(interval_counts) > 0
        new_units = np.logical_and(active, np.logical_not(seen_units))
        if previous_counts is None:
            # The first interval has nothing to compare against.
            net_change = np.zeros(len(interval_counts))
        else:
            net_change = [now - before
                          for now, before in zip(interval_counts, previous_counts)]
        interval_key = str(first_year) + '-' + str(last_year)
        # Build a fresh dict per interval so earlier entries are not aliased
        # and overwritten by later ones.
        interval_count_dicts[interval_key] = {
            'net-change': net_change,
            'unit': units,
            'counts': interval_counts,
            'new-units': new_units,
        }
        seen_units = np.logical_or(seen_units, new_units)
        previous_counts = interval_counts
    write_output_data.write_data(output_filename, interval_count_dicts)
File added
import pandas as pd
import numpy as np
def write_data(filename, data):
    """Write one Excel sheet per interval to the workbook at ``filename``.

    Args:
        filename: Path of the workbook to create; the extension selects the
            Excel engine pandas uses.
        data: Mapping of interval label to a dict with the keys 'unit',
            'counts', 'net-change' and 'new-units', each holding one value
            per unit. The label (as a string) becomes the sheet name.

    Nothing is written when ``data`` is empty.
    """
    if not data:
        # A workbook needs at least one sheet; skip creating an empty file.
        return
    # The context manager saves and closes the workbook on exit; without it
    # the sheets are never flushed to disk.
    with pd.ExcelWriter(filename) as writer:
        for year, yd in data.items():
            year_dict = {
                'unit': list(yd['unit']),
                'number of projects': list(yd['counts']),
                'net change': list(yd['net-change']),
                'new unit': list(yd['new-units']),
            }
            df = pd.DataFrame.from_dict(year_dict)
            df.to_excel(writer, sheet_name=str(year))
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment