#!/usr/bin/env python
"""Extract tables from a PDF file and save them in several formats.

The first table found on each page is turned into a DataFrame (its first
row is used as the header), all page tables are concatenated, newlines
inside string cells are replaced by spaces, and the result is written to
``tables.pkl.gz``, ``tables.csv.gz`` and ``tables.xlsx`` in the current
working directory.
"""
import csv
from argparse import ArgumentParser

import pandas as pd
import pdfplumber


def _clean_cell(value):
    """Return *value* with newlines replaced by spaces.

    Non-string values are returned unchanged. Cells are not guaranteed to
    be strings: an empty cell may come back from the extraction as None,
    and concatenating pages whose tables have different columns fills the
    gaps with NaN. Calling ``.replace`` on those would raise.
    """
    if isinstance(value, str):
        return value.replace('\n', ' ')
    return value


def main():
    """Parse the command line, extract the tables and write the outputs.

    Raises:
        SystemExit: if no page of the PDF contains a table.
    """
    parser = ArgumentParser(description='read tables from PDF file')
    parser.add_argument('filename', help='PDF file to read')
    args = parser.parse_args()

    # read pdf and create a list of dataframes (first table of each page)
    dfs = []
    with pdfplumber.open(args.filename) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()
            if tables:
                table = tables[0]
                # first row is the header, remaining rows are the data
                dfs.append(pd.DataFrame(table[1:], columns=table[0]))

    # pd.concat raises an opaque ValueError on an empty list; fail clearly
    if not dfs:
        raise SystemExit(f'no tables found in {args.filename}')

    # concatenate dataframes with a fresh 0..n-1 index; ignore_index avoids
    # the reset_index/drop('index') round trip, which would delete a real
    # data column if a table happened to have a column named 'index'
    df = pd.concat(dfs, ignore_index=True)

    # replace '\n' by ' ' in every string cell
    for column in df.columns:
        df[column] = df[column].map(_clean_cell)

    # save
    df.to_pickle('tables.pkl.gz')
    df.to_csv('tables.csv.gz')
    df.to_excel('tables.xlsx')


if __name__ == '__main__':
    main()