Wednesday, 10 July 2024

How to Read in 1 or All Excel Sheets into a Pandas DF Using Xlwings?

 

                                                    Photo by Jeff Sheldon on Unsplash

One of the advantages of using Xlwings to read in Excel data into Pandas dataframe is that numeric columns that should be stored as string (ie. leading zeros are retained) will be object Dtype. Pandas.read_excel() will, instead, convert such columns into int64 Dtype (ie. leading zeros are removed).

I have created a function that can read in a single or all sheets of an Excel into a pandas df.

 



import pandas as pd
import os
import xlwings as xw

 

# =============================================================================

# INSTRUCTIONS FOR read_pw_excel

# Function can also read in CSV files. But as the underlying software is Excel, if the CSV file exceeds the maximum number of rows allowed in Excel, only the 1st row up to the limit will be read in.

 

# 1) Leave sheet blank to read in all sheets in the Excel file.

# Output will be a vertically stacked df of all sheets. Otherwise, specifiy the exact sheet to read in for df.

# If reading in all sheets,

   # Code will read in the 1st row of the 1st sheet as the header.

   # Whether subsequent sheets have header row or do not have header row (ie all rows are data), code will use the 1st row of the 1st sheet as the header.

# Code will drop columns with None as header (including the index column if it doesn't have header)

# Code will convert all header to upper case

 

# 2) Leave excel_range blank to read in the used range in the sheet(s). Otherwise specify the exact range like 'A:B'.

# 3) Leave pw blank if Excel file is not password protected

# =============================================================================

 

def read_pw_excel(link, sheet='', excel_range = '', pw=''):

 

    if not os.path.exists(link):

        print("link does not exist. Please try again.")

        raise Exception("LinkNotFoundError")

       

    df_merged=pd.DataFrame()

    col_names_first=''

    col_names_next=''

    lt_type=''

    index_col=''

    app = xw.App()

    app.display_alerts=False

   

    if pw=='':

        #filebook = xw.Book(link)

        filebook = app.books.open(link, update_links=False)

    else: filebook = app.books.open(link, update_links=False, password=pw)

   

    

    sht_ct = filebook.sheets.count

 

    for ct in range(0, sht_ct):

 

        if sheet=='':         

            if excel_range == '':

                range_to_use = filebook.sheets[ct].used_range  

            else: range_to_use = excel_range

       

            data = filebook.sheets[ct].range(range_to_use).value # Note this will be a list Eg [['Name', 'Age'], ['Tom', 4], ['Harry', 3]]. If there is only 1 column or 1 row, it will be ['Name', 'Tom', 'Harry'] or [ 'Tom', 'Tan', 6]

           

        else:

            if excel_range == '':

                range_to_use = filebook.sheets[sheet].used_range            

            else: range_to_use = excel_range

       

            data=filebook.sheets[sheet].range(range_to_use).value

 

 

 

        if ct==0:

           

            # if there is only 1 column, we convert header into a list: [['Name'], 'Tom', 'Harry'] - otherwise issue in converting to upper later when using [x.upper() if x is not None else x for x in data[0]]. Each letter of 'Name' will be separate element

            if not isinstance(data[0], list):

                data[0]=[data[0]]

                lt_type='single_col'

       

            

            pos_none= [pos for pos, x in enumerate(data[0]) if x == None] # data[0] is the header column. # when using used_range Python may read in some None columns, we find the position of these None columns

            if 0 in pos_none: # If column has NONE header, it is probably an index column. We will need an indicator to later remove this index column

                index_col='Y'

               

                

            col_names_first_ls=data[0]

            col_names_first_ls = [x for x in col_names_first_ls if x != None] # when using used_range Python may read in some None columns, we drop these

            col_names_first= ''.join(str(x).replace(' ', '').upper() for x in col_names_first_ls)

            

        else:

            

            # if there is only 1 column, we convert header into a list: [['Name'], 'Tom', 'Harry'] - otherwise issue in converting to upper later when using [x.upper() if x is not None else x for x in data[0]]. Each letter of 'Name' will be separate element

            if not isinstance(data[0], list) and lt_type=='single_col':

                pos_none=[]

              

                try:

                    data[0]=[data[0]]

                except: #TypeError: 'str' object does not support item assignment. Occurs when data is a single str/number (ie, there is only 1 column with 1 row)

                    data=[[data]]

                   

                col_names_next_ls=data[0]

               

                

                

            elif not isinstance(data[0], list) and lt_type!='single_col':  # imply there is only 1 data row without header

                col_names_next_ls=data

                col_names_next_ls = [x for x in col_names_next_ls if x != None]

           

            else: # imply there is > 1 row and >1 col

                pos_none= [pos for pos, x in enumerate(data[0]) if x == None]

                

                col_names_next_ls=data[0]

                col_names_next_ls = [x for x in col_names_next_ls if x != None]

               

            

            col_names_next= ''.join(str(x).replace(' ', '').upper() for x in col_names_next_ls)

           

       

        

       

        # for 1st sheet and 2nd onwards sheets which have same  header as 1st sheet

        if (ct==0) or ((ct!=0) & (col_names_first==col_names_next)):

            

            data[0]=[x for x in data[0] if x != None] # we remove any None header

 

            # IF THERE IS NONE header column read in, we remove all the data (should all be None) for that column

            if pos_none:

                for pos, x in enumerate(data[1:]):

                    i=0

                    for n in pos_none:

                        n=n+i

                        x.pop(n)

                        i-=1

 

            cols=[str(x).upper() if x is not None else x for x in data[0]] # this line will have issue if data[0]  is  a single string of the column name

            # cols=[]

            # if isinstance(data[0], list):

            #     for x in data[0]:

            #         if x is not None:

            #             cols.append(x.upper())

            #         else: cols.append(x)

           

            # elif not isinstance(data[0], list): cols.append(data[0].upper())

               

            df = pd.DataFrame(data[1:], columns = cols)

            df = df[df.columns.dropna()]  # drop columns with None as header. Some Excel file have >1 None columns, duplicate columns will have issue with concat

            #df.columns = map(str.upper, df.columns) # May not work if header has None 

            df_merged=pd.concat([df_merged, df],axis=0, ignore_index=True)

           

        elif (ct!=0) & (col_names_first!=col_names_next): # in case the 2nd onwards sheets don't have header (ie. 1st row is data), we need to add the header of the 1st sheet to it in order to concat

            if lt_type=='single_col': data[0]=data[0][0] # if only 1 column, the 1st row data shan't be a list [['Tom'], 'Dick']

               

            if len (col_names_next_ls) > len (col_names_first_ls): # Possibly some None header Column read in for ct!=. It could be an index column

                # IF THERE IS an index column read in, we remove all the data for that column and don't include it

       

                if index_col=='Y':

                    for x in data[0:]:

                        try:

                            x.pop(0)

                        except:  #AttributeError: 'float'/ 'str' object has no attribute 'pop'. This occurs if it is a single row. And data is a single list (not an embedded list) Eg [10.0, 'tom_left', 20.0, 200.0, 31.0]

                            data.pop(0)

                            break

                       

                new_col_ls=col_names_first_ls

 

                      

            elif len (col_names_next_ls) < len (col_names_first_ls):

                print("Number of column in Sheet " + str(ct) + " is fewer than in Sheet 0.")

                raise Exception("Number of Columns in Sheets Differ Error")

              

               

            elif len (col_names_next_ls) == len (col_names_first_ls):

                 new_col_ls=col_names_first_ls

             

        

            cols=[x.upper() if x is not None else x for x in new_col_ls]

           

            try:

                df = pd.DataFrame(data[0:], columns = cols)

 

            except: # when data[0:] is a single list with >1 col, sometimes will have ValueError: Shape of passed values is (x, 1), indices imply (x, x)  Then will have to nest it in a list

                df = pd.DataFrame([data[0:]], columns = cols)

          

            df = df[df.columns.dropna()]

            #df.columns = map(str.upper, df.columns)

            df_merged=pd.concat([df_merged, df],axis=0, ignore_index=True)

        

                

                

            

            

        if sheet!='': break

 

    filebook.close()

    app.quit()

   

    return df_merged

No comments:

Post a Comment

How to Read in 1 or All Excel Sheets into a Pandas DF Using Xlwings?

                                                                   Photo by Jeff Sheldon on Unsplash One of the advantages of using Xlwings...