Photo by Sora Sagano on Unsplash
While Python offers several merge options like pandas.merge, pandas.concat, and join; none of them can replicate the exact same output of SAS Merge.
Some differences between SAS and Python Merge…
- For SAS when merging datasets with overlapping columns, it will take the column in the right-most dataset. While for Python, overlapping columns will be suffixed to distinguish them.
- For SAS if df_left has more rows than df_right for a merge key, the non-overlapping columns from df_right for the extra rows will be filled with the data of the last row in df_right. For Python, however, it will do a Cartesian Join (Cross Join).
The below diagrams illustrates the differences.
For SAS merge, the overlapping columns age and ht take the values from df_right for Obs 1–2. For the non-overlapping column from df_right (sales_right), Obs 3 takes the same value as Obs 2.
For Python merge, the overlapping columns age and ht are suffixed with “_left” for the columns from df_left. Also, a cartesian join is performed resulting in 3 x 2 = 6 observations.
Thus I have come up with a function in Python to replicate SAS Merge.
import pandas as pd
import numpy as np
# =============================================================================
# SOME DIFFERENCES BETWEEN SAS & PYTHON MERGE
# For SAS, when merging datasets with overlapping columns, it will take the column in the right-most dataset
# For Python, overlapping columns will be suffixed to distinguish them
#
# For SAS if df_left has > rows than df_right, The non-overlapping columns from df_right, for the extra rows will be filled with the data of the last row in df_right.
# But in Python, it will do a Cartesian Join (Cross Join)
# =============================================================================
# =============================================================================
# INSTRUCTIONS FOR SAS_merge
# The 1st 2 args are the left and right dfs
# The 3rd arg is the HOW of the merge. Similar to SAS merge, the options are
# 'if df_left'
# 'if df_right'
# 'if df_left and df_right'
# '' # this is the SAS merge without the If statement
# 'if df_left NOT =df_right'
# 'if df_right NOT =df_left'
# The 4th arg is the ind. Put 'Y' if u want the variable '_merge' (this tells which df the record is from), else put 'N'
# The 5th arg is the merge key. It is of variable length. If merging on >1 variable separate the arg by ,
# =============================================================================
def SAS_merge (df_left_copy, df_right_copy, how, ind, *merge_key):
df_left=df_left_copy.copy() # this is to prevent the original df being modified in the function. Note that all df will be modified in function even if not returned.
df_right=df_right_copy.copy() # this is to prevent the original df being modified in the function. Note that all df will be modified in function even if not returned.
df_left = df_left [df_left.columns.dropna()] #Drop columns with Nan header, or else there will be errors
df_right = df_right [df_right.columns.dropna()]
df_left_subset_merge=pd.DataFrame()
df_right_subset_merge=pd.DataFrame()
df_left['KEY']=''
df_right['KEY']=''
for col in merge_key:
df_left[col] = df_left[col].fillna('')
df_right[col] = df_right[col].fillna('')
for col in merge_key:
df_left['KEY'] = df_left[['KEY', col]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
df_right['KEY'] = df_right[['KEY', col]].apply(lambda row: ''.join(row.values.astype(str)), axis=1)
if how=='if df_left' or how=='if df_right' or how=='if df_left and df_right' or how=='':
df_left.sort_values(by=list(merge_key), kind='stable', ignore_index=True, inplace=True)
df_right.sort_values(by=list(merge_key), kind='stable', ignore_index=True, inplace=True)
#check_df_left = df_left['KEY'].to_list()
#check_df_right = df_right['KEY'].to_list()
#common_key=list(set(check_df_left) & set(check_df_right)) # find the unique key that exists in both df
col_common =list(np.intersect1d(df_left.columns, df_right.columns))
col_common.remove('KEY')
for col in merge_key:
col_common.remove(col)
col_missing_right = df_right.columns.difference(df_left.columns).tolist() # find the columns in df_right that are missing from df_left
col_missing_left = df_left.columns.difference(df_right.columns).tolist() # find the columns in df_left that are missing from df_right
# find the common key in both df and their number of occurrence by inner merge
check_df_left=pd.DataFrame(df_left.value_counts(subset=['KEY'], dropna=True)) # KEY becomes the index in the df
check_df_right=pd.DataFrame(df_right.value_counts(subset=['KEY'], dropna=True))
merge_check=pd.merge(check_df_left, check_df_right, how='inner', left_index=True, right_index=True).reset_index(drop=False)
merge_left=merge_check[(merge_check['0_x']>=merge_check['0_y']) & (merge_check['0_y']>1)] #'0_x' is the num of occurrence in left df. issues occur if '0_y'>1 as Python does a cartesian merge
merge_right=merge_check[(merge_check['0_x']<=merge_check['0_y']) & (merge_check['0_x']>1)]
for c in merge_left['KEY'].to_list():
df_left_key=df_left[df_left['KEY']==c].shape[0]
df_right_key=df_right[df_right['KEY']==c].shape[0]
#row_list=merge[merge['KEY']==c].index.tolist()
df_left_subset=df_left[df_left['KEY']==c].reset_index(drop=True)
df_left=df_left[df_left['KEY']!=c]
df_right_subset=df_right[df_right['KEY']==c].reset_index(drop=True)
df_right=df_right[df_right['KEY']!=c]
# for overlapping columns fill df_left with values from df_right
#df_left_subset[list(np.intersect1d(df_left_subset.columns, df_right_subset.columns))]=df_right_subset[list(np.intersect1d(df_left_subset.columns, df_right_subset.columns))]
for col in col_common:
for i in range (0, df_right_key):
df_left_subset.iloc[i, df_left_subset.columns.get_loc(col)]=df_right_subset.iloc[i, df_right_subset.columns.get_loc(col)]
for col in col_missing_right:
df_left_subset[col]=df_right_subset[col] # for this to work, both df must have same index
# for SAS if df_left has > rows than df_right. The columns from df_right, for the extra rows will be filled with the data of the last row in df_right.
for i in range (df_right_key, df_left_key):
df_left_subset.iloc[i, df_left_subset.columns.get_loc(col)]=df_left_subset.iloc[i-1, df_left_subset.columns.get_loc(col)]
df_left_subset_merge = pd.concat([df_left_subset_merge, df_left_subset], axis = 0, ignore_index=True)
for c in merge_right['KEY'].to_list():
df_left_key=df_left[df_left['KEY']==c].shape[0]
df_right_key=df_right[df_right['KEY']==c].shape[0]
#row_list=merge[merge['KEY']==c].index.tolist()
df_left_subset=df_left[df_left['KEY']==c].reset_index(drop=True)
df_left=df_left[df_left['KEY']!=c]
df_right_subset=df_right[df_right['KEY']==c].reset_index(drop=True)
df_right=df_right[df_right['KEY']!=c]
for col in col_missing_left:
df_right_subset[col]=df_left_subset[col] # for this to work, both df must have same index
# for SAS if df_right has > rows than df_left. The columns from df_left, for the extra rows will be filled with the data of the last row in df_left.
for i in range (df_left_key, df_right_key):
df_right_subset.iloc[i, df_right_subset.columns.get_loc(col)]=df_right_subset.iloc[i-1, df_right_subset.columns.get_loc(col)]
df_right_subset_merge = pd.concat([df_right_subset_merge, df_right_subset], axis = 0, ignore_index=True)
if how=='if df_left':
merge = pd.merge(df_left, df_right, how='left', on=merge_key, suffixes=('_left', ''), sort=True, indicator=True)
col_ls=[col[:-5] for col in merge.columns if col.endswith('_left')]
for col in col_ls:
merge.loc[merge['_merge']=='left_only', col]=merge.loc[merge['_merge']=='left_only', col+'_left']
elif how=='if df_right':
merge = pd.merge(df_left, df_right, how='right', on=merge_key, suffixes=('_left', ''), sort=True, indicator=True)
elif how=='if df_left and df_right':
merge = pd.merge(df_left, df_right, how='inner', on=merge_key, suffixes=('_left', ''), sort=True, indicator=True)
elif how=='':
merge = pd.merge(df_left, df_right, how='outer', on=merge_key, suffixes=('_left', ''), sort=True, indicator=True)
col_ls=[col[:-5] for col in merge.columns if col.endswith('_left')]
for col in col_ls:
merge.loc[merge['_merge']=='left_only', col]=merge.loc[merge['_merge']=='left_only', col+'_left']
merge = pd.concat([merge, df_left_subset_merge, df_right_subset_merge], axis = 0)
merge = merge.loc[:, ~merge.columns.str.startswith('KEY')]
merge ['_merge'].fillna('both', inplace=True) # these are the rows from df_left_subset_merge, df_right_subset_merge
merge.drop(merge.filter(regex='_left$').columns, axis=1, inplace=True, errors='ignore')
if ind=='N':
merge.drop(['_merge'], axis=1, inplace=True, errors='ignore')
merge.sort_values(by=list(merge_key), kind='stable', ignore_index=True, inplace=True)
elif how=='if df_left NOT =df_right' or how=='if df_right NOT =df_left':
df_left_subset_merge = df_left[~df_left['KEY'].isin(df_right['KEY'])]
df_right_subset_merge = df_right[~df_right['KEY'].isin(df_left['KEY'])]
merge = pd.concat([df_left_subset_merge, df_right_subset_merge], axis = 0)
merge = merge.loc[:, ~merge.columns.str.startswith('KEY')]
if ind=='N':
merge.drop(['_merge'], axis=1, inplace=True, errors='ignore')
merge.sort_values(by=list(merge_key), kind='stable', ignore_index=True, inplace=True)
return merge
# Toy Dataset to Test Python Merge, Join, Concat vs SAS Merge
data = {
"name": ['dick_both', 'sam_both', 'sam_both', 'sam_both','cat_both','susan_both', 'susan_both', 'john_both', 'john_both', 'tom_left', 'tom_left'],
"age": [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20],
"ht": [200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200],
"sales_l": [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
}
left = pd.DataFrame(data)
data = {
"name": ['dick_both', 'sam_both', 'sam_both','cat_both', 'cat_both', 'susan_both', 'john_both', 'john_both', 'john_both', 'harry_right', 'harry_right' ],
"age": [50, 50, 50, 50,50, 50, 50, 50, 50, 50, 50],
"ht": [5, 5, 5, 5,5, 5, 5, 5, 5, 5, 5],
"sales_r": [51, 52, 53, 54,55, 56, 57, 58, 59, 60, 61]
}
right = pd.DataFrame(data)
merge = pd.merge(left, right, how='right', on=['name'], suffixes=('_left', ''), sort=True)
print('PYTHON MERGE')
print(merge)
concat=pd.concat([left, right], axis=1,join='inner')
print('PYTHON CONCAT')
print(concat)
join=left.join(right, how='left', lsuffix='_left', rsuffix='')
print('PYTHON JOIN')
print(join)
merge=SAS_merge (left, right,'if df_left', 'N', 'name')
print('SAS MERGE')
print(merge)
No comments:
Post a Comment