piper.io.read_excels

piper.io.read_excels(source='inputs/', glob_pattern='*.xls*', func=None, include_file=False, reset_index=True, info=False)[source]

Read, concatenate, combine and return a pandas dataframe based on workbook(s) data within a given source folder.

Parameters
  • source – source folder containing workbook data (that is, files ending with ‘.xls*’)

  • glob_pattern – file extension filter (str), default - ‘.xls

  • func

    pass a custom function to read/transform each workbook/sheet. default is None (just performs a read_excel())

    Note: custom function will recieve Path object. To obtain the ‘string’ filename - use Path.as_posix()

  • include_file – include filename in returned dataframe - default False

  • reset_index – default True

  • info – Provide debugging information - default False

Returns

Return type

pd.DataFrame - pandas DataFrame

Examples

Using a custom function, use below as a guide.

def clean_data(xl_file):

    df = pd.read_excel(xl_file.as_posix(), header=None)

    # Combine first two rows to correct column headings
    df.columns = df.loc[0].values + ' ' + df.loc[1].values

    # Read remaining data and reference back to dataframe reference
    df = df.iloc[2:]

    # clean up column names to make it easier to use them in calculations
    df = clean_names(df)

    # Set date data types for order and ship dates
    cols = ['order_date', 'ship_date']
    date_data = [pd.to_datetime(df[col]) for col in cols]
    df[cols] = pd.concat(date_data, axis=1)

    # Separate shipping mode from container
    df_split = df['ship_mode_container'].str.extract(r'(.*)-(.*)')
    df_split.columns = ['ship', 'container']

    priority_categories = ['Low', 'Medium', 'High', 'Critical', 'Not Specified']
    priority_categories = pd.CategoricalDtype(priority_categories, ordered=True)

    # split out and define calculated fields using lambdas
    sales_amount = lambda x: (x.order_quantity * x.unit_sell_price
                              * (1 - x.discount_percent)).astype(float).round(2)

    days_to_ship=lambda x: ((x.ship_date - x.order_date) / pd.Timedelta(1, 'day')).astype(int)

    sales_person=lambda x: x.sales_person.str.extract('(Mr|Miss|Mrs) (\w+) (\w+)').loc[:, 1]

    order_priority=lambda x: x.order_priority.astype(priority_categories)

    # Define/assign new column (values)
    df = (df.assign(ship_mode=df_split.ship,
                    container=df_split.container,
                    sales_amount=sales_amount,
                    days_to_ship=days_to_ship,
                    sales_person=sales_person,
                    order_priority=order_priority)
            .drop(columns=['ship_mode_container'])
            .pipe(move_column, 'days_to_ship', 'after', 'ship_date')
            .pipe(move_column, 'sales_amount', 'after', 'unit_sell_price')
            .pipe(clean_names, replace_char=('_', ' '), title=True)
         )

    return df

data = read_excels('inputs/Data', func=clean_data)
head(data, 2)