Thursday 20 June 2024

Python Equivalent of SAS Proc Summary

Photo by Clément Hélardot on Unsplash

While Python is a powerful language, it may not have ready-to-use functions that replicate SAS procedures exactly. This article provides a custom function, SAS_pro_summary, that replicates most of the SAS Proc Summary procedure.

import pandas as pd
import numpy as np
import math
from itertools import combinations
from statsmodels.stats.weightstats import DescrStatsW



# =============================================================================
# INSTRUCTIONS

# For stat, the options are any combination of the below:
# 'sum', 'sum_wt', 'count', 'min', 'max', 'mean', 'mean_wt', 'var', 'var_wt', np.std, 'std_wt'
# (note that np.std is passed as the NumPy function itself, not as a string)

# class_1 holds the CLASS variables in SAS Proc Summary.
# The variables in class_1 should be in the same order as in the SAS Proc Summary code; otherwise, '_TYPE_' will differ.

# var holds the VAR variables in SAS Proc Summary.

# id_1 holds the ID variables in SAS Proc Summary. Leave the [] empty if no ID variable is needed.

# weight holds the WEIGHT variable in SAS Proc Summary. Leave the [] empty if no WEIGHT variable is needed.

# nway='Y' is the same as including NWAY in SAS Proc Summary; nway='N' means NOT to include it.

# missing='Y' is the same as including MISSING in SAS Proc Summary; missing='N' means NOT to include it.

# If you only want the FREQ of the class variables, leave var empty and put 'count' in stat.

# =============================================================================
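
# For reference, an assumed mapping (for illustration only; the variable names
# are hypothetical): a call such as
#
#   result = SAS_pro_summary(df, stat=['count', 'min', 'max', 'mean', np.std],
#                            class_1=['name', 'sex'], var=['age'],
#                            id_1=['last_name'], weight=[], nway='N', missing='Y')
#
# is intended to mirror the SAS step
#
#   proc summary data=df missing;
#       class name sex;
#       var age;
#       id last_name;
#       output out=result;
#   run;
#
# where omitting the statistic keywords on OUTPUT makes SAS write one row per
# statistic (N, MIN, MAX, MEAN, STD), identified by the _STAT_ column, much as
# this function does.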


def SAS_pro_summary(df_copy, stat=[], class_1=[], var=[], id_1=[], weight=[], nway='N', missing='Y'):
    df=df_copy.copy() # work on a copy so the caller's DataFrame is not modified (pandas passes DataFrames by reference, so in-place changes would otherwise leak out)
    
    df_final_gpby=pd.DataFrame()
    class_combinations = []
    code_list = []
    
    df['_FREQ_']=''
    df['_TYPE_']=''
    df['KEY']=''
    
    # This is a check in case users include the same variables in both class_1 & id_1; a CLASS variable should not also be carried as an ID variable
    # Find the common items
    common_items = list(set(id_1) & set(class_1))

    # Remove the common items from id_1
    id_1 = [item for item in id_1 if item not in common_items]
    
    if missing=='Y':
        pass
    elif missing=='N':
        for col in class_1:
            df[col] = df[col].replace('', np.nan) # avoid in-place replace on a column view
        df = df.dropna(subset=class_1, how='any', axis=0) # SAS drops the whole row as long as any class variable is NaN
   
    #for weighted sum
    if weight and 'sum_wt' in stat:
        df_weighted=df.copy()
        for n in var:
        #   df_weighted.dropna(subset=[n, weight[0]], axis=0, inplace=True)
            df_weighted[n]=df_weighted[n]*df_weighted[weight[0]]
          
             
    # this is for '_TYPE_'
    for n in range (1, len(class_1)+1):
        code_list.append('CODE_'+ str(n))
        
        
    for n in range(len(class_1) + 1):
        class_combinations += list(combinations(class_1, n)) # Create all the different combinations of the class variables
    class_combinations.pop(0) # remove the empty combination (it corresponds to the grand total, _TYPE_=0)
   
    id_1.append('KEY') 
    for s in stat:
        df_TOTAL=pd.DataFrame()
        df_total_gpby=pd.DataFrame()
        df_unique=pd.DataFrame()
        var_dict={'_FREQ_': 'count'}
        
        df_gpby_variance_merged=pd.DataFrame()
        df_gpby_variance_merged['KEY']=''
        
        
        for n in var:
            if s=='sum_wt' or s=='sum':
                var_dict[n]=lambda x: np.nan if x.isna().all() else x.sum() # pandas sum returns 0 even if all summed items are NaN, so this lambda returns NaN instead
            elif s=='mean_wt':
                #mean_wt_fn = lambda x: np.ma.average(x.dropna(), weights=df.loc[x.dropna().index, weight[0]]) # function to compute weighted mean and dropna
                mean_wt_fn = lambda x: np.ma.average(x, weights=df.loc[x.index, weight[0]]) 
                var_dict[n]=mean_wt_fn
            elif s=='var_wt' or s=='std_wt':
               pass
            else:
                var_dict[n]=s
                   
        # this is to create the stat based on the entire df
        if s=='sum_wt':
            df_TOTAL=pd.DataFrame(df_weighted.agg(var_dict))
            

        elif s=='var_wt' or s=='std_wt' or s=='mean_wt':   
            df_TOTAL=pd.DataFrame(df.agg(var_dict))
            for n in var:
                df_copy=df.copy()
                df_copy.dropna(subset=[n, weight[0]], how='any', axis=0, inplace=True) # Need to drop rows with NaN as the function would return NaN even if there are some valid values

                if s=='var_wt':
                    # SAS weighted variance (VARDEF=DF) is sum(w*(x-mean)^2)/(n-1); generalised from the
                    # original ddof=10/*10 shortcut, which only matches SAS when every weight equals 10
                    dsw = DescrStatsW(df_copy[n], weights=df_copy[weight[0]], ddof=0)
                    df_TOTAL.loc[n] = dsw.var * dsw.sum_weights / (len(df_copy) - 1) if len(df_copy) > 1 else np.nan
                elif s=='std_wt':
                    dsw = DescrStatsW(df_copy[n], weights=df_copy[weight[0]], ddof=0)
                    df_TOTAL.loc[n] = math.sqrt(dsw.var * dsw.sum_weights / (len(df_copy) - 1)) if len(df_copy) > 1 else np.nan
                elif s=='mean_wt':
                    df_TOTAL.loc[n] = np.ma.average(df_copy[n], weights=df_copy[weight[0]]) # weighted mean over the rows where both the value and the weight are present
                             
        else:
            df_TOTAL=pd.DataFrame(df.agg(var_dict))
            
            
        df_TOTAL = df_TOTAL.T
        df_TOTAL['_TYPE_']=0
        if s!=np.std:
            df_TOTAL['_STAT_']=s
        elif s==np.std:
            df_TOTAL['_STAT_']='std'
        df.sort_values(by=id_1, kind='stable', ignore_index=True, inplace=True, na_position='first')
        for n in id_1:
            df_TOTAL[n]=df[n].iloc[-1]
      
        
        
        var_dict['KEY']='last' 
        for n in class_combinations:
            
            # this is for '_TYPE_'. For each class variable, record '1' if it is present in this combination, '0' otherwise
            code_dict = {}
            for i, k in zip(class_1, code_list):
                code_dict[k] = '1' if i in n else '0'
                        
            code_full=''.join(code_dict.values())   # concat all the code variables into a single string
            
            
            sort_tup=n+tuple(id_1)
            df['KEY'] =''
            df.sort_values(by=list(sort_tup), kind='stable', ignore_index=True, inplace=True, na_position='first') # ensure the class variables and ids are sorted consistently before the keys are built
            
            if s=='sum_wt':
                df_weighted['KEY'] =''
                df_weighted.sort_values(by=list(sort_tup), kind='stable', ignore_index=True, inplace=True, na_position='first') 
                for o in n: # For the class variables present in this iteration, create a key
                    df_weighted[o] = df_weighted[o].fillna('') # if NaN is not filled with blank, the key will be blank as long as any column has NaN
                    df_weighted['KEY'] = df_weighted[['KEY', o]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                    df_weighted[o] = df_weighted[o].replace(r'^\s*$', np.nan, regex=True) # replace the empty string (created in the 1st step) with NaN again
                    
                    
            for o in n: # For the class variables present in this iteration, create a key
                df[o] = df[o].fillna('') # if NaN is not filled with blank, the key will be blank as long as any column has NaN
                df['KEY'] = df[['KEY', o]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                df[o] = df[o].replace(r'^\s*$', np.nan, regex=True) # replace the empty string (created in the 1st step) with NaN again
            df_unique = df.drop_duplicates(subset=['KEY'], keep='last') # Extract only the last to merge in the id variables
            
            if missing=='Y':
                if s=='sum_wt':
                    df_gpby = df_weighted.groupby(list(n), dropna=False).agg(var_dict).reset_index()
                
                elif s=='mean_wt':
                    
                    # find groups that have more than one element
                    non_unique=df['KEY'].value_counts()
                    non_unique=non_unique.where(non_unique>1).dropna()
                    non_unique_list=non_unique.index.tolist()
                
                    df_gpby=df.groupby(list(n), dropna=False).agg({'KEY':'last', '_FREQ_':'count'}).reset_index()
                   
                    for k in var:
                        # Drop rows that belong to groups with more than one element and have NaN; otherwise the whole group would be NaN even when it has some valid values
                        df_subset=df[~((df['KEY'].isin(non_unique_list)) & (df[[weight[0],k]].isnull().any(axis=1)))]
                        mean_wt_fn_subset = lambda x: np.ma.average(x, weights=df_subset.loc[x.index, weight[0]])
                        df_gpby_subset= df_subset.groupby(list(n), dropna=False).agg({k: mean_wt_fn_subset, 'KEY':'last'}).reset_index()
                        df_gpby=pd.merge(df_gpby, df_gpby_subset[[k, 'KEY']], how='outer', on='KEY', suffixes=('_x', ''))
                        
                elif s=='var_wt' or s=='std_wt':   
                    df_gpby = df.groupby(list(n), dropna=False).agg(var_dict).reset_index()
                    
                    for k in var:
                        # SAS weighted variance (VARDEF=DF) is sum(w*(x-mean)^2)/(n-1); generalised from the
                        # original ddof=10/*10 shortcut, which only matches SAS when every weight equals 10
                        variance_fn = lambda x: (DescrStatsW(x[k], weights=x[weight[0]], ddof=0).var * x[weight[0]].sum() / (len(x) - 1)) if len(x) > 1 else np.nan
                        df_gpby_variance= df.dropna(subset=[k, weight[0]], how='any', axis=0).groupby(list(n), dropna=False).apply(variance_fn).reset_index()
                        df_gpby_variance.rename(columns={0: k}, inplace=True)
                        
                        if s=='std_wt':
                            df_gpby_variance[k]= np.sqrt(df_gpby_variance[k])
                            
                        df_gpby_variance['KEY']=''
                        for z in n: # For the class variables present in this iteration, create a key
                            df_gpby_variance[z] = df_gpby_variance[z].fillna('') # if NaN is not filled with blank, the key will be blank as long as any column has NaN
                            df_gpby_variance['KEY'] = df_gpby_variance[['KEY', z]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                            df_gpby_variance[z] = df_gpby_variance[z].replace(r'^\s*$', np.nan, regex=True) # replace the empty string (created in the 1st step) with NaN again
                               
                               
                        df_gpby_variance_merged=pd.merge(df_gpby_variance_merged, df_gpby_variance[[k, 'KEY']], how='outer', on='KEY', suffixes=('_x', ''))
                       
                    df_gpby = pd.merge(df_gpby, df_gpby_variance_merged[var + ['KEY']], how='left', on='KEY', suffixes=('_x', ''))
                    
                    
                else:
                    df_gpby = df.groupby(list(n), dropna=False).agg(var_dict).reset_index()
                     
            elif missing=='N':
                if s=='sum_wt':
                    df_gpby = df_weighted.groupby(list(n), dropna=True).agg(var_dict).reset_index()
                
                elif s=='mean_wt':
                    
                    # find groups that have more than one element
                    non_unique=df['KEY'].value_counts()
                    non_unique=non_unique.where(non_unique>1).dropna()
                    non_unique_list=non_unique.index.tolist()
                     
                    df_gpby=df.groupby(list(n), dropna=True).agg({'KEY':'last', '_FREQ_':'count'}).reset_index()
               
                    for k in var:
                        # Drop rows that belong to groups with more than one element and have NaN; otherwise the whole group would be NaN even when it has some valid values
                        df_subset=df[~((df['KEY'].isin(non_unique_list)) & (df[[weight[0],k]].isnull().any(axis=1)))]
                        mean_wt_fn_subset = lambda x: np.ma.average(x, weights=df_subset.loc[x.index, weight[0]])
                        df_gpby_subset= df_subset.groupby(list(n), dropna=True).agg({k: mean_wt_fn_subset, 'KEY':'last'}).reset_index()
                        df_gpby=pd.merge(df_gpby, df_gpby_subset[[k, 'KEY']], how='outer', on='KEY', suffixes=('_x', ''))
                      
                
                elif s=='var_wt' or s=='std_wt':   
                    df_gpby = df.groupby(list(n), dropna=True).agg(var_dict).reset_index()
                    
                    for k in var:
                        # SAS weighted variance (VARDEF=DF) is sum(w*(x-mean)^2)/(n-1); generalised from the
                        # original ddof=10/*10 shortcut, which only matches SAS when every weight equals 10
                        variance_fn = lambda x: (DescrStatsW(x[k], weights=x[weight[0]], ddof=0).var * x[weight[0]].sum() / (len(x) - 1)) if len(x) > 1 else np.nan
                        df_gpby_variance= df.dropna(subset=[k, weight[0]], how='any', axis=0).groupby(list(n), dropna=True).apply(variance_fn).reset_index()
                        df_gpby_variance.rename(columns={0: k}, inplace=True)
                        
                        if s=='std_wt':
                            df_gpby_variance[k]= np.sqrt(df_gpby_variance[k])
                            
                        df_gpby_variance['KEY']=''
                        for z in n: # For the class variables present in this iteration, create a key
                            df_gpby_variance[z] = df_gpby_variance[z].fillna('') # if NaN is not filled with blank, the key will be blank as long as any column has NaN
                            df_gpby_variance['KEY'] = df_gpby_variance[['KEY', z]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
                            df_gpby_variance[z] = df_gpby_variance[z].replace(r'^\s*$', np.nan, regex=True) # replace the empty string (created in the 1st step) with NaN again
                               
                               
                        df_gpby_variance_merged=pd.merge(df_gpby_variance_merged, df_gpby_variance[[k, 'KEY']], how='outer', on='KEY', suffixes=('_x', ''))
                    
                    df_gpby = pd.merge(df_gpby, df_gpby_variance_merged[var + ['KEY']], how='left', on='KEY', suffixes=('_x', ''))
                    
                    
                else:
                    df_gpby = df.groupby(list(n), dropna=True).agg(var_dict).reset_index()
             
            df_gpby['_TYPE_']=code_full
            df_gpby = pd.merge(df_gpby, df_unique[id_1], how='left', on='KEY', suffixes=('_x', ''))
            
          
            df_total_gpby= pd.concat([df_total_gpby, df_gpby], axis = 0, ignore_index=True) 
        
        
        df_total_gpby.drop(df_total_gpby.filter(regex='_x$').columns, axis=1, inplace=True, errors='ignore')
        df_total_gpby.drop(columns=['KEY'], axis=1, inplace=True, errors='ignore')
    
        df_total_gpby['_TYPE_']=pd.to_numeric(df_total_gpby['_TYPE_'], errors='coerce')
        df_total_gpby.sort_values(by=['_TYPE_'], kind='stable', ignore_index=True, inplace=True, na_position='first')
        df_total_gpby['_TYPE_']=pd.factorize(df_total_gpby['_TYPE_'])[0] + 1 # factorise starts from 0, so need to +1
        
        df_total_gpby = pd.concat([df_TOTAL, df_total_gpby], axis = 0, ignore_index=True) 
        
        if  nway=='Y':
            df_total_gpby=df_total_gpby.nlargest(1, columns=['_TYPE_'], keep='all')
       
        if s!=np.std:
            df_total_gpby['_STAT_']=s
        elif s==np.std:
            df_total_gpby['_STAT_']='std'
            
        df_final_gpby = pd.concat([df_final_gpby, df_total_gpby], axis = 0, ignore_index=True)    
 
    df_final_gpby.drop(columns=['KEY'], axis=1, inplace=True, errors='ignore')
    sort_list=['_TYPE_'] + list(reversed(class_1)) + ['_STAT_']
    df_final_gpby.sort_values(by=sort_list, kind='stable', ignore_index=True, inplace=True, na_position='first')
    #print(df_final_gpby)

    return df_final_gpby
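
To make the '_TYPE_' logic above easier to follow, here is a minimal standalone sketch (illustrative only, with hypothetical class variables) of how each non-empty combination of the class variables is turned into a binary code:

from itertools import combinations

class_vars = ['name', 'sex']  # hypothetical class variables
combos = []
for r in range(len(class_vars) + 1):
    combos += list(combinations(class_vars, r))
combos.pop(0)  # the empty combination is the grand total, _TYPE_ = 0

for combo in combos:
    # '1' if the class variable is present in this grouping, '0' otherwise
    code = ''.join('1' if v in combo else '0' for v in class_vars)
    print(combo, code)  # ('name',) -> '10', ('sex',) -> '01', ('name', 'sex') -> '11'

The function then sorts these codes numerically and factorises them, so the smallest code becomes _TYPE_=1, matching the SAS numbering of the class combinations.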


# Test SAS_pro_summary with the toy dataset below

data = {
    "name": ['dick', 'sam', 'sam', 'sam', 'susan', 'susan', 'john', 'john', 'john', 'john', np.nan],
    "last_name": ['lim', 'tan', 'tan', 'lim', 'wee', 'lim', 'low', 'tan', 'tan', 'lee', 'tan'],
    "id": ['S', 'F', 'S', 'F', 'F', 'S', 'S', 'F', 'F', 'F', 'F'],
    "loc": ['E', 'W', 'N', 'S', 'N', 'S', 'E', 'N', 'N', 'N', 'N'],
    "sex": ['M', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F'],
    "age": [np.nan, 20, 20, 20, 20, 789, 234, 9999, 10, 10, np.nan],
    "age2": [np.nan, -123, 20, 20, 20, 789, 234, 9999, np.nan, np.nan, np.nan],
    "wt": [90, 10, 10, 10, 10, 10, 10, 10, 10, np.nan, np.nan]
}


df = pd.DataFrame(data)



result = SAS_pro_summary(df, stat=['count'], class_1=['name'], var=[], id_1=['sex', 'last_name'], weight=['wt'], nway='N', missing='N')
print(result)
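
As a further illustration (a hypothetical call, reusing the toy data above), the weighted mean of age by name can be requested like this, keeping only the most granular rows with nway='Y':

# Weighted mean of age by name, keeping only the most granular rows (nway='Y')
df2 = pd.DataFrame(data)
result_wt = SAS_pro_summary(df2, stat=['mean_wt'], class_1=['name'], var=['age'],
                            id_1=[], weight=['wt'], nway='Y', missing='Y')
print(result_wt)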



