Wednesday 3 July 2024

Python Equivalent of SAS Proc Transpose

 

                                                Photo by Microsoft Edge on Unsplash

As Python does not have readily available functions that is exactly identical to SAS Proc Transpose, this article provides a created function SAS_proc_transpose that exactly reproduces most of SAS Proc Transpose procedure.

 

 



import pandas as pd
import numpy as np

# =============================================================================

# INSTRUCTIONS FOR SAS_proc_transpose

 

# The 1st arg is the df.

# The 2nd arg is the BY variable(s). Enter into the list. Leave the list empty if no BY variables required.

# The 3rd arg is the ID variable(s). Enter into the list. Leave the list empty if no ID variables required.

# The 4th arg is the VAR variable(s). Enter into the list. If the list is left empty, similar to SAS, all the numeric variables will be used as the VAR variables

 

## https://support.sas.com/documentation/cdl/en/proc/61895/HTML/default/viewer.htm#a000063668.htm

# SAS_proc_transpose works similar to SAS PROC TRANSPOSE with LET option. In other words, when there are duplicate ID Values, it transposes the observation that contains the LAST occurrence of the duplicate ID value.

# SAS_proc_transpose, similar to SAS PROC TRANSPOSE, it does not transpose observations that have a missing value for one or more ID variables.

# =============================================================================

 

def SAS_proc_transpose (df_copy, by_1=[], var=[], id_1=[]):

    df=df_copy.copy() # this is to prevent the original df being modified in the function. Note that all df will be modified in function even if not returned.

   

    if not var: # if var is empty, we use all the numeric columns as the variables

        var_col = df.select_dtypes(include=['number']).columns.to_list()

    else: var_col=var

    if not id_1:

        if by_1:

            tranpos=pd.melt (df, id_vars=by_1, value_vars=var_col)

           

            by_1.append('variable')

            tranpos['g'] = tranpos.groupby(by_1, dropna=False).cumcount() + 1

           

            by_1.append('g')

            tranpos = tranpos.set_index(by_1)['value'].unstack().add_prefix('COL').reset_index()

            by_1.remove('g')

           

            tranpos.sort_values(by=by_1, na_position= 'first', ignore_index=True, kind='stable', inplace=True)

            tranpos.rename(columns={'variable': '_NAME_'}, inplace=True)

           

        elif not by_1:

             tranpos=df[var_col]

             tranpos=tranpos.T

             tranpos.columns = [ 'COL' + str(col + 1) for col in tranpos.columns]

             tranpos.index = tranpos.index.set_names('_NAME_')

             tranpos.reset_index(inplace=True)

    elif id_1:

        if by_1:

   

            #new_col=by_1 + id_1

            new_col0=by_1 + id_1

            new_col= []

            [new_col.append(x) for x in new_col0 if x not in new_col]  #Remove duplicate. In case there are same col in both by_1 & id_1

            tranpos=pd.melt (df, id_vars=new_col, value_vars=var_col)

            for col in new_col:

                #tranpos.dropna(subset = [col], inplace=True)

                tranpos[col]= tranpos[col].astype(str)

                tranpos[col]= tranpos[col].replace(['None', 'nan'] , 'nan_python')

           

            tranpos['key1']= tranpos[new_col + ['variable']].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

            tranpos.drop_duplicates(subset=['key1'], keep='last', inplace=True)

           

            tranpos['key2']= tranpos[id_1].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

            #tranpos.drop_duplicates(subset=['key2'], keep='last', inplace=True)

            new_col=by_1 + ['variable', 'key2']

            tranpos = tranpos.set_index(new_col)['value'].unstack().reset_index()

           

            tranpos.drop(tranpos.filter(regex = 'nan_python').columns, axis = 1, inplace = True, errors='ignore') # Drop columns with nan_python in column header. These are the id columns with nan cells that are transposed.

            tranpos.rename_axis(None, axis=1, inplace=True)

            tranpos.rename(columns={"variable": "_NAME_"}, inplace=True)

            tranpos= tranpos.replace(['nan_python'] , np.nan )

   

            tranpos.sort_values(by=by_1+['_NAME_'], axis=0, ascending=True, inplace=True, kind='stable', na_position='first', ignore_index=True)

   

        elif not by_1:

            tranpos=df.copy()

            for col in id_1:

                tranpos.dropna(subset = [col], inplace=True)

                tranpos[col]= tranpos[col].astype(str)

   

            tranpos['key']= tranpos[id_1].apply(lambda row: ''.join(row.values.astype(str)), axis=1)

            tranpos.drop_duplicates(subset=['key'], keep='last', inplace=True)

           

            new_col=var_col + ['key']

            tranpos=tranpos[new_col]

           

            #tranpos.rename(columns={'key': None}, inplace=True)

            #tranpos.set_index(None, inplace=True)

            tranpos.set_index('key', inplace=True)

            tranpos= tranpos.T

            

            tranpos.reset_index(inplace=True)

            tranpos.rename_axis(None, axis=1, inplace=True)

            tranpos.rename(columns={'index': '_NAME_'}, inplace=True)

      
    return tranpos
    

# Toy dataset to test out SAS_proc_transpose

 data = {

"name": ['dick_both', 'sam_both', 'sam_both', 'sam_both','cat_both',np.nan, 'susan_both', 'john_both', 'john_both', 'tom_left', 'tom_left', 'harry', 'test'],

"cat": ['A', 'B', 'B', 'C', 'C', 'D', 'E', 'H', np.nan, 'H', 'I', np.nan, 'E'],

"loc": ['E','W', 'W','S', 'S', 'E', 'W', 'N', 'N', 'S', 'E', 'E', 'W'],

"age": [20, 20, 19, 22, 20, 20, 20, 99, 88, 21, 22, 20, np.nan],

"ht": [np.nan, 200, 200, 200, 200, 200, 200,  200, 200,  200, 200, 200, np.nan],

"sales_l": [21, 22, 23, 24, 25, 26, 27, 28, 28, 30, 31, 32, np.nan]

}

left = pd.DataFrame(data)

df= SAS_proc_transpose (left, by_1=['name',], var=['ht','age'], id_1=['cat'])

print('PROC TRANSPOSE')

print(df)



No comments:

Post a Comment

How to Read in 1 or All Excel Sheets into a Pandas DF Using Xlwings?

                                                                   Photo by Jeff Sheldon on Unsplash One of the advantages of using Xlwings...