# pandas

pandas is a Python library for data manipulation and analysis.

Panda Eating Bamboo

# Libraries

# Notes

# Snippets

# Connect to a SQLite database

import pandas as pd
import sqlite3  # the stdlib module is named "sqlite3", not "sqlite"

# Open the database file and load an entire table into a DataFrame.
conn = sqlite3.connect("database.sqlite")
df = pd.read_sql_query("SELECT * FROM table_name;", conn)

df.head()
1
2
3
4
5
6
7

# Using a SQLAlchemy engine to connect to a database

import pandas as pd
from sqlalchemy import create_engine

# Any SQLAlchemy connection URL works here (PostgreSQL, MySQL, SQLite, ...).
engine = create_engine("postgresql:///database")

# read_sql_query accepts a SQLAlchemy Engine directly via the `con` argument.
df = pd.read_sql_query("SELECT * FROM table;", con=engine)

df.head()
1
2
3
4
5
6
7
8
9

# Python-compatible column names with slugify

Usually I'm dealing with data from external sources that don't have pretty column names. I like to use slugify (opens new window) to convert them to Python-compatible keys.

from slugify import slugify


def _python_friendly(name):
    """Slugify one column name into a lowercase snake_case identifier."""
    return slugify(name, separator="_", to_lower=True)


df.columns = [_python_friendly(name) for name in df.columns]
1
2
3

# Read CSV file with all cells as strings

>>> df = pd.read_csv("data/source/example.csv", dtype=str)
>>> df.dtypes
ID                         object
NAME                       object
SALARY                     object
1
2
3
4
5

# Transpose DataFrame and view all rows

>>> with pd.option_context("display.max_rows", None):
...    print(df.head(1).transpose())
ID                         1
NAME       Myles Braithwaite
SALARY               $10,000
1
2
3
4
5

# Convert a column from continuous to categorical

>>> df["age_groups"] = pd.cut(
...     df.age,
...     bins=[0, 18, 65, 99],
...     labels=["child", "adult", "elderly"]
... )
1
2
3
4
5

Kevin Markham (justmarkham (opens new window)) - https://twitter.com/justmarkham/status/1146040449678925824 (opens new window)

# Read a CSV file data in chunk size

Sometimes a CSV file is just too large for the memory on your computer. You can tell the argument chunksize how many rows of data you would like to load at a time.

# With chunksize set, read_csv returns an iterator of DataFrames
# (a TextFileReader) rather than a single DataFrame.
df_chunks = pd.read_csv("data/source/data.csv", chunksize=10000)

# The reader is an iterator, not a list, so it cannot be indexed
# as df_chunks[0]; pull the first chunk with next() instead.
next(df_chunks).head()
1
2
3

If you would like to scale down the data and load it into one pd.DataFrame:

def chunk_preprocessing(df):
    """Preprocess a chunk of the data before analysis.

    Replace the body with your own transformation. It must return a
    DataFrame (or anything ``pd.concat`` accepts) — a bare ``pass``
    would return ``None`` and break the ``pd.concat`` call over the
    processed chunks shown below.

    Arguments
    ---------
    df : `pd.DataFrame`
        The chunk of the data source that needs to be processed.
    """
    # Placeholder implementation: return the chunk unchanged.
    return df


# Run the per-chunk preprocessing step over every chunk in turn.
df_chunks_processed = []
for chunk in df_chunks:
    df_chunks_processed.append(chunk_preprocessing(chunk))

# Stitch the processed chunks back together into a single DataFrame.
df_concat = pd.concat(df_chunks_processed)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

# Pandas/SQL Rosetta Stone

# IN / pandas.DataFrame.isin

-- Standard SQL uses single quotes for string literals;
-- double quotes denote identifiers (column/table names).
SELECT *
FROM table
WHERE
    city IN (
        'Toronto',
        'Richmond Hill'
    );
1
2
3
4
5
6
7
# Rows where city is either Toronto or Richmond Hill:
wanted = df['city'].isin(['Toronto', 'Richmond Hill'])
df[wanted]

# Rows where city is neither Markdale nor Oakville:
unwanted = df['city'].isin(['Markdale', 'Oakville'])
df[~unwanted]
1
2
3
4
5

See the pandas documentation for more information on pandas.DataFrame.isin (opens new window).

Last Updated: 12/26/2022, 5:42:03 PM