import pandas as pd
import numpy as np
import math
from itertools import combinations
from statsmodels.stats.weightstats import DescrStatsW
# =============================================================================
# INSTRUCTIONS
# For stat, the options are all or a combination of the below:
# 'sum', 'sum_wt', 'count', 'min', 'max', 'mean', 'mean_wt', 'var', 'var_wt', np.std, 'std_wt'
# (np.std is passed as the numpy function itself, not as a string)
# class_1 holds the CLASS variables in SAS Proc Summary.
# The variables in class_1 should be listed in the same order as in the SAS Proc Summary code; otherwise '_TYPE_' will differ.
# var holds the VAR variables in SAS Proc Summary.
# id_1 holds the ID variables in SAS Proc Summary. Leave the [] empty if no ID variable is needed.
# weight holds the WEIGHT variable in SAS Proc Summary. Leave the [] empty if no WEIGHT variable is needed.
# nway='Y' is the same as specifying NWAY in SAS Proc Summary; nway='N' means it is NOT specified.
# missing='Y' is the same as specifying MISSING in SAS Proc Summary; missing='N' means it is NOT specified.
# If you only want the FREQ of the class variables, leave var empty and put 'count' in stat.
# =============================================================================
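# For orientation, a sketch of the SAS construct being emulated (illustrative
# SAS syntax for reference only; it is not part of this Python script, and the
# dataset and variable names are placeholders):
#   proc summary data=have missing;
#       class name;
#       var age;
#       id sex;
#       weight wt;
#       output out=want sum=;
#   run;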
def SAS_pro_summary(df_copy, stat=[], class_1=[], var=[], id_1=[], weight=[], nway='N', missing='Y'):
    df = df_copy.copy()  # prevent the original df from being modified: a DataFrame passed into a function is otherwise changed in place, even if not returned
    df_final_gpby = pd.DataFrame()
    class_combinations = []
    code_list = []
    df['_FREQ_'] = ''
    df['_TYPE_'] = ''
    df['KEY'] = ''
    # Guard against users listing the same variable in both class_1 and id_1; a common class variable must not be appended to the id list
    # Find the common items
    common_items = list(set(id_1) & set(class_1))
    # Remove the common items from id_1
    id_1 = [item for item in id_1 if item not in common_items]
    if missing == 'Y':
        pass
    elif missing == 'N':
        for col in class_1:
            df[col].replace('', np.nan, inplace=True)
        df = df.dropna(subset=class_1, how='any', axis=0)  # SAS drops the whole row as long as any class variable has NaN
    # For the weighted sum, pre-multiply each VAR column by the weight
    if weight and 'sum_wt' in stat:
        df_weighted = df.copy()
        for n in var:
            # df_weighted.dropna(subset=[n, weight[0]], axis=0, inplace=True)
            df_weighted[n] = df_weighted[n] * df_weighted[weight[0]]
    # This is for '_TYPE_'
    for n in range(1, len(class_1) + 1):
        code_list.append('CODE_' + str(n))
    for n in range(len(class_1) + 1):
        class_combinations += list(combinations(class_1, n))  # create every combination of the class variables
    class_combinations.pop(0)  # remove the empty combination (handled separately as the grand total)
    id_1.append('KEY')
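    # One pass per requested statistic: build the grand-total row (_TYPE_=0) first,
    # then one block of rows per class-variable combination, as Proc Summary does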
    for s in stat:
        df_TOTAL = pd.DataFrame()
        df_total_gpby = pd.DataFrame()
        df_unique = pd.DataFrame()
        var_dict = {'_FREQ_': 'count'}
        df_gpby_variance_merged = pd.DataFrame()
        df_gpby_variance_merged['KEY'] = ''
        for n in var:
            if s == 'sum_wt' or s == 'sum':
                var_dict[n] = lambda x: np.nan if x.isna().all() else x.sum()  # plain sum returns 0 even if all summed items are np.nan, so use this function to return NaN instead
                # var_dict[n] = 'sum'
            elif s == 'mean_wt':
                # mean_wt_fn = lambda x: np.ma.average(x.dropna(), weights=df.loc[x.dropna().index, weight[0]])  # weighted mean with dropna
                mean_wt_fn = lambda x: np.ma.average(x, weights=df.loc[x.index, weight[0]])
                var_dict[n] = mean_wt_fn
            elif s == 'var_wt' or s == 'std_wt':
                pass
            else:
                var_dict[n] = s
        # Create the statistic over the entire df (the grand-total row)
        if s == 'sum_wt':
            df_TOTAL = pd.DataFrame(df_weighted.agg(var_dict))
        elif s == 'var_wt' or s == 'std_wt' or s == 'mean_wt':
            df_TOTAL = pd.DataFrame(df.agg(var_dict))
            for n in var:
                df_copy = df.copy()
                df_copy.dropna(subset=[n, weight[0]], how='any', axis=0, inplace=True)  # drop rows with NaN first, as the function would return NaN even when some valid values exist
                if s == 'var_wt':
                    df_TOTAL.loc[n] = (DescrStatsW(df_copy[n], weights=df_copy[weight[0]], ddof=10).var) * 10
                elif s == 'std_wt':
                    df_TOTAL.loc[n] = math.sqrt((DescrStatsW(df_copy[n], weights=df_copy[weight[0]], ddof=10).var) * 10)
                elif s == 'mean_wt':
                    mean_wt_fn_subset = lambda x: np.ma.average(x, weights=df_copy.loc[x.index, weight[0]])
                    var_val = df_copy[n].agg({n: mean_wt_fn_subset})
                    df_TOTAL.loc[n] = var_val[0]
        else:
            df_TOTAL = pd.DataFrame(df.agg(var_dict))
        df_TOTAL = df_TOTAL.T
        df_TOTAL['_TYPE_'] = 0
        if s != np.std:
            df_TOTAL['_STAT_'] = s
        elif s == np.std:
            df_TOTAL['_STAT_'] = 'std'
        df.sort_values(by=id_1, kind='stable', ignore_index=True, inplace=True, na_position='first')
        for n in id_1:
            df_TOTAL[n] = df[n].iloc[-1]
        var_dict['KEY'] = 'last'
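        # Now aggregate once per class-variable combination; the 0/1 code string built
        # below becomes the SAS-style _TYPE_ value after the numeric conversion at the end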
        for n in class_combinations:
            # For '_TYPE_': assign 1 to the code variable of every class variable present in this combination
            code_dict = {}
            for i, k in zip(class_1, code_list):
                for o in n:
                    if o == i:
                        code_dict[k] = '1'
                        break
                    elif o != i:
                        code_dict[k] = '0'
            code_full = ''.join(code_dict.values())  # concatenate all the code variables into a single string
            sort_tup = n + tuple(id_1)
            df['KEY'] = ''
            df.sort_values(by=list(sort_tup), kind='stable', ignore_index=True, inplace=True, na_position='first')  # ensure the class variables and ids are sorted correctly
            if s == 'sum_wt':
                df_weighted['KEY'] = ''
                df_weighted.sort_values(by=list(sort_tup), kind='stable', ignore_index=True, inplace=True, na_position='first')
                for o in n:  # for the class variables present in this combination, build a key
                    df_weighted[o] = df_weighted[o].fillna('')  # if NaN is not filled with blank, the key would be blank whenever any column has NaN
                    df_weighted['KEY'] = df_weighted[['KEY', o]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                    df_weighted[o].replace(r'^\s*$', np.nan, regex=True, inplace=True)  # replace the blanks (created in the first step) with NaN again
            for o in n:  # for the class variables present in this combination, build a key
                df[o] = df[o].fillna('')  # if NaN is not filled with blank, the key would be blank whenever any column has NaN
                df['KEY'] = df[['KEY', o]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                df[o].replace(r'^\s*$', np.nan, regex=True, inplace=True)  # replace the blanks (created in the first step) with NaN again
            df_unique = df.drop_duplicates(subset=['KEY'], keep='last')  # keep only the last row per key, to merge in the id variables
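            # Aggregate this combination; dropna in groupby mirrors Proc Summary's MISSING
            # option: missing='Y' keeps NaN class levels as groups, missing='N' drops them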
            drop_na_groups = (missing == 'N')  # missing='Y' -> dropna=False, missing='N' -> dropna=True
            if s == 'sum_wt':
                df_gpby = df_weighted.groupby(list(n), dropna=drop_na_groups).agg(var_dict).reset_index()
            elif s == 'mean_wt':
                # Find groups that have more than one element
                non_unique = df['KEY'].value_counts()
                non_unique = non_unique.where(non_unique > 1).dropna()
                non_unique_list = non_unique.index.tolist()
                df_gpby = df.groupby(list(n), dropna=drop_na_groups).agg({'KEY': 'last', '_FREQ_': 'count'}).reset_index()
                for k in var:
                    # Drop rows that belong to groups with more than one element and have NaN; otherwise the entire group would be NaN even when some valid values exist
                    # df_subset = pd.concat([df[(df['KEY'].isin(non_unique_list)) & (df[[weight[0], k]].notnull().all(axis=1))], df[~df['KEY'].isin(non_unique_list)]], axis=0)
                    df_subset = df[~((df['KEY'].isin(non_unique_list)) & (df[[weight[0], k]].isnull().any(axis=1)))]
                    mean_wt_fn_subset = lambda x: np.ma.average(x, weights=df_subset.loc[x.index, weight[0]])
                    df_gpby_subset = df_subset.groupby(list(n), dropna=drop_na_groups).agg({k: mean_wt_fn_subset, 'KEY': 'last'}).reset_index()
                    # df_gpby = pd.concat([df_gpby, df_gpby_subset[df_gpby_subset.columns.difference(df_gpby.columns)]], axis=1)
                    df_gpby = pd.merge(df_gpby, df_gpby_subset[[k, 'KEY']], how='outer', on='KEY', suffixes=('_x', ''))
            elif s == 'var_wt' or s == 'std_wt':
                df_gpby = df.groupby(list(n), dropna=drop_na_groups).agg(var_dict).reset_index()
                for k in var:
                    variance_fn = lambda x: (DescrStatsW(x[k], weights=x[weight[0]], ddof=10).var) * 10
                    df_gpby_variance = df.dropna(subset=[k, weight[0]], how='any', axis=0).groupby(list(n), dropna=drop_na_groups).apply(variance_fn).reset_index()
                    df_gpby_variance.rename(columns={0: k}, inplace=True)
                    if s == 'std_wt':
                        df_gpby_variance[k] = np.sqrt(df_gpby_variance[k])
                    df_gpby_variance['KEY'] = ''
                    for z in n:  # for the class variables present in this combination, build a key
                        df_gpby_variance[z] = df_gpby_variance[z].fillna('')  # if NaN is not filled with blank, the key would be blank whenever any column has NaN
                        df_gpby_variance['KEY'] = df_gpby_variance[['KEY', z]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                        df_gpby_variance[z].replace(r'^\s*$', np.nan, regex=True, inplace=True)  # replace the blanks (created in the first step) with NaN again
                    df_gpby_variance_merged = pd.merge(df_gpby_variance_merged, df_gpby_variance[[k, 'KEY']], how='outer', on='KEY', suffixes=('_x', ''))
                df_gpby = pd.merge(df_gpby, df_gpby_variance_merged[var + ['KEY']], how='left', on='KEY', suffixes=('_x', ''))
            else:
                df_gpby = df.groupby(list(n), dropna=drop_na_groups).agg(var_dict).reset_index()
            df_gpby['_TYPE_'] = code_full
            df_gpby = pd.merge(df_gpby, df_unique[id_1], how='left', on='KEY', suffixes=('_x', ''))
            df_total_gpby = pd.concat([df_total_gpby, df_gpby], axis=0, ignore_index=True)
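        # After all combinations: drop the helper columns, then convert the 0/1 code
        # strings into the numeric _TYPE_ values SAS reports (sort, then rank via factorize)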
        df_total_gpby.drop(df_total_gpby.filter(regex='_x$').columns, axis=1, inplace=True, errors='ignore')
        df_total_gpby.drop(columns=['KEY'], axis=1, inplace=True, errors='ignore')
        df_total_gpby['_TYPE_'] = pd.to_numeric(df_total_gpby['_TYPE_'], errors='coerce')
        df_total_gpby.sort_values(by=['_TYPE_'], kind='stable', ignore_index=True, inplace=True, na_position='first')
        df_total_gpby['_TYPE_'] = pd.factorize(df_total_gpby['_TYPE_'])[0] + 1  # factorize starts from 0, so add 1
        df_total_gpby = pd.concat([df_TOTAL, df_total_gpby], axis=0, ignore_index=True)
        if nway == 'Y':
            df_total_gpby = df_total_gpby.nlargest(1, columns=['_TYPE_'], keep='all')  # NWAY keeps only the highest _TYPE_, i.e. the full n-way combination
        if s != np.std:
            df_total_gpby['_STAT_'] = s
        elif s == np.std:
            df_total_gpby['_STAT_'] = 'std'
        df_final_gpby = pd.concat([df_final_gpby, df_total_gpby], axis=0, ignore_index=True)
    df_final_gpby.drop(columns=['KEY'], axis=1, inplace=True, errors='ignore')
    sort_list = ['_TYPE_'] + list(reversed(class_1)) + ['_STAT_']
    df_final_gpby.sort_values(by=sort_list, kind='stable', ignore_index=True, inplace=True, na_position='first')
    # print(df_final_gpby)
    return df_final_gpby
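# To see how the class combinations drive '_TYPE_', here is a minimal standalone
# sketch of the same bookkeeping (the demo_* names are illustrative, not part of
# the function above):
demo_class = ['name', 'loc']
demo_combos = []
for r in range(len(demo_class) + 1):
    demo_combos += list(combinations(demo_class, r))
demo_combos.pop(0)  # drop the empty combination; the grand total is _TYPE_ = 0
for combo in demo_combos:
    # '1' where a class variable is present in this combination, '0' where it is not
    code = ''.join('1' if c in combo else '0' for c in demo_class)
    print(combo, '->', code)  # ('name',) -> 10, ('loc',) -> 01, ('name', 'loc') -> 11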
# Test SAS_pro_summary with the toy dataset below
data = {
    "name": ['dick', 'sam', 'sam', 'sam', 'susan', 'susan', 'john', 'john', 'john', 'john', np.nan],
    "last_name": ['lim', 'tan', 'tan', 'lim', 'wee', 'lim', 'low', 'tan', 'tan', 'lee', 'tan'],
    "id": ['S', 'F', 'S', 'F', 'F', 'S', 'S', 'F', 'F', 'F', 'F'],
    "loc": ['E', 'W', 'N', 'S', 'N', 'S', 'E', 'N', 'N', 'N', 'N'],
    "sex": ['M', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F'],
    "age": [np.nan, 20, 20, 20, 20, 789, 234, 9999, 10, 10, np.nan],
    "age2": [np.nan, -123, 20, 20, 20, 789, 234, 9999, np.nan, np.nan, np.nan],
    'wt': [90, 10, 10, 10, 10, 10, 10, 10, 10, np.nan, np.nan]
}
df = pd.DataFrame(data)
df = SAS_pro_summary(df, stat=['count'], class_1=['name'], var=[], id_1=['sex', 'last_name'], weight=['wt'], nway='N', missing='N')
print(df)
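# A further illustrative call (an assumption for demonstration, not part of the
# original test): exercise the weighted statistics on the same toy data by
# passing the weight column and the *_wt stat names.
df2 = pd.DataFrame(data)
df2 = SAS_pro_summary(df2, stat=['sum_wt', 'mean_wt'], class_1=['name', 'loc'],
                      var=['age', 'age2'], id_1=['sex'], weight=['wt'],
                      nway='N', missing='Y')
print(df2)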