Photo by Jeff Sheldon on Unsplash
One of the advantages of using Xlwings to read in Excel data into Pandas dataframe is that numeric columns that should be stored as string (ie. leading zeros are retained) will be object Dtype. Pandas.read_excel() will, instead, convert such columns into int64 Dtype (ie. leading zeros are removed).
I have created a function that can read in a single or all sheets of an Excel into a pandas df.
import pandas as pd
import os
import xlwings as xw
# =============================================================================
# INSTRUCTIONS FOR read_pw_excel
# Function can also read in CSV files. But as the underlying software is Excel, if the CSV file exceeds the maximum number of rows allowed in Excel, only the 1st row up to the limit will be read in.
# 1) Leave sheet blank to read in all sheets in the Excel file.
# Output will be a vertically stacked df of all sheets. Otherwise, specifiy the exact sheet to read in for df.
# If reading in all sheets,
# Code will read in the 1st row of the 1st sheet as the header.
# Whether subsequent sheets have header row or do not have header row (ie all rows are data), code will use the 1st row of the 1st sheet as the header.
# Code will drop columns with None as header (including the index column if it doesn't have header)
# Code will convert all header to upper case
# 2) Leave excel_range blank to read in the used range in the sheet(s). Otherwise specify the exact range like 'A:B'.
# 3) Leave pw blank if Excel file is not password protected
# =============================================================================
def read_pw_excel(link, sheet='', excel_range = '', pw=''):
if not os.path.exists(link):
print("link does not exist. Please try again.")
raise Exception("LinkNotFoundError")
df_merged=pd.DataFrame()
col_names_first=''
col_names_next=''
lt_type=''
index_col=''
app = xw.App()
app.display_alerts=False
if pw=='':
#filebook = xw.Book(link)
filebook = app.books.open(link, update_links=False)
else: filebook = app.books.open(link, update_links=False, password=pw)
sht_ct = filebook.sheets.count
for ct in range(0, sht_ct):
if sheet=='':
if excel_range == '':
range_to_use = filebook.sheets[ct].used_range
else: range_to_use = excel_range
data = filebook.sheets[ct].range(range_to_use).value # Note this will be a list Eg [['Name', 'Age'], ['Tom', 4], ['Harry', 3]]. If there is only 1 column or 1 row, it will be ['Name', 'Tom', 'Harry'] or [ 'Tom', 'Tan', 6]
else:
if excel_range == '':
range_to_use = filebook.sheets[sheet].used_range
else: range_to_use = excel_range
data=filebook.sheets[sheet].range(range_to_use).value
if ct==0:
# if there is only 1 column, we convert header into a list: [['Name'], 'Tom', 'Harry'] - otherwise issue in converting to upper later when using [x.upper() if x is not None else x for x in data[0]]. Each letter of 'Name' will be separate element
if not isinstance(data[0], list):
data[0]=[data[0]]
lt_type='single_col'
pos_none= [pos for pos, x in enumerate(data[0]) if x == None] # data[0] is the header column. # when using used_range Python may read in some None columns, we find the position of these None columns
if 0 in pos_none: # If column has NONE header, it is probably an index column. We will need an indicator to later remove this index column
index_col='Y'
col_names_first_ls=data[0]
col_names_first_ls = [x for x in col_names_first_ls if x != None] # when using used_range Python may read in some None columns, we drop these
col_names_first= ''.join(str(x).replace(' ', '').upper() for x in col_names_first_ls)
else:
# if there is only 1 column, we convert header into a list: [['Name'], 'Tom', 'Harry'] - otherwise issue in converting to upper later when using [x.upper() if x is not None else x for x in data[0]]. Each letter of 'Name' will be separate element
if not isinstance(data[0], list) and lt_type=='single_col':
pos_none=[]
try:
data[0]=[data[0]]
except: #TypeError: 'str' object does not support item assignment. Occurs when data is a single str/number (ie, there is only 1 column with 1 row)
data=[[data]]
col_names_next_ls=data[0]
elif not isinstance(data[0], list) and lt_type!='single_col': # imply there is only 1 data row without header
col_names_next_ls=data
col_names_next_ls = [x for x in col_names_next_ls if x != None]
else: # imply there is > 1 row and >1 col
pos_none= [pos for pos, x in enumerate(data[0]) if x == None]
col_names_next_ls=data[0]
col_names_next_ls = [x for x in col_names_next_ls if x != None]
col_names_next= ''.join(str(x).replace(' ', '').upper() for x in col_names_next_ls)
# for 1st sheet and 2nd onwards sheets which have same header as 1st sheet
if (ct==0) or ((ct!=0) & (col_names_first==col_names_next)):
data[0]=[x for x in data[0] if x != None] # we remove any None header
# IF THERE IS NONE header column read in, we remove all the data (should all be None) for that column
if pos_none:
for pos, x in enumerate(data[1:]):
i=0
for n in pos_none:
n=n+i
x.pop(n)
i-=1
cols=[str(x).upper() if x is not None else x for x in data[0]] # this line will have issue if data[0] is a single string of the column name
# cols=[]
# if isinstance(data[0], list):
# for x in data[0]:
# if x is not None:
# cols.append(x.upper())
# else: cols.append(x)
# elif not isinstance(data[0], list): cols.append(data[0].upper())
df = pd.DataFrame(data[1:], columns = cols)
df = df[df.columns.dropna()] # drop columns with None as header. Some Excel file have >1 None columns, duplicate columns will have issue with concat
#df.columns = map(str.upper, df.columns) # May not work if header has None
df_merged=pd.concat([df_merged, df],axis=0, ignore_index=True)
elif (ct!=0) & (col_names_first!=col_names_next): # in case the 2nd onwards sheets don't have header (ie. 1st row is data), we need to add the header of the 1st sheet to it in order to concat
if lt_type=='single_col': data[0]=data[0][0] # if only 1 column, the 1st row data shan't be a list [['Tom'], 'Dick']
if len (col_names_next_ls) > len (col_names_first_ls): # Possibly some None header Column read in for ct!=. It could be an index column
# IF THERE IS an index column read in, we remove all the data for that column and don't include it
if index_col=='Y':
for x in data[0:]:
try:
x.pop(0)
except: #AttributeError: 'float'/ 'str' object has no attribute 'pop'. This occurs if it is a single row. And data is a single list (not an embedded list) Eg [10.0, 'tom_left', 20.0, 200.0, 31.0]
data.pop(0)
break
new_col_ls=col_names_first_ls
elif len (col_names_next_ls) < len (col_names_first_ls):
print("Number of column in Sheet " + str(ct) + " is fewer than in Sheet 0.")
raise Exception("Number of Columns in Sheets Differ Error")
elif len (col_names_next_ls) == len (col_names_first_ls):
new_col_ls=col_names_first_ls
cols=[x.upper() if x is not None else x for x in new_col_ls]
try:
df = pd.DataFrame(data[0:], columns = cols)
except: # when data[0:] is a single list with >1 col, sometimes will have ValueError: Shape of passed values is (x, 1), indices imply (x, x) Then will have to nest it in a list
df = pd.DataFrame([data[0:]], columns = cols)
df = df[df.columns.dropna()]
#df.columns = map(str.upper, df.columns)
df_merged=pd.concat([df_merged, df],axis=0, ignore_index=True)
if sheet!='': break
filebook.close()
app.quit()
return df_merged
No comments:
Post a Comment