Skip to content

Pandas

Pandas is a data manipulation library in python, used for data analysis and cleaning.

It provides two primary structures:

  • Series: one-dimension array like object.
  • DataFrames: two-dimensional, size-mutable tabular data structure.

Install packages

1
2
3
4
!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.0

Import packages

1
2
3
4
5
6
import sqlite3
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd

Series

Create series from List

1
2
3
4
5
6
data_from_list = [1, 2, 3, 4, 5]

series_from_list = pd.Series(data_from_list)

print(f"Series:\n{series_from_list}")
print(type(series_from_list))
Series:

0    1

1    2

2    3

3    4

4    5

dtype: int64

<class 'pandas.core.series.Series'>

Create Series from Dict

1
2
3
4
5
data_from_dict = {"a": 1, "b": 2}

series_from_dict = pd.Series(data_from_dict)

print(f"Series:\n{series_from_dict}")
Series:

a    1

b    2

dtype: int64

Working with Series indexes

1
2
3
4
5
6
7
8
data_from_series_with_index = [10, 20, 30]
series_index = ["a", "b", "c"]

series_from_data_with_index = pd.Series(
    data_from_series_with_index, index=series_index
)

print(f"Series:\n{series_from_data_with_index}")
Series:

a    10

b    20

c    30

dtype: int64

Data frames

Creating DataFrames

DataFrame from a list of lists

data_from_list_of_lists = [
    [1, "Alice", 25],
    [2, "Bob", 30],
    [3, "Charlie", 22],
]

columns_for_list_of_lists = ["ID", "Name", "Age"]

df_from_list_of_lists = pd.DataFrame(
    data_from_list_of_lists,
    columns=columns_for_list_of_lists,
)

df_from_list_of_lists
ID Name Age
0 1 Alice 25
1 2 Bob 30
2 3 Charlie 22

DataFrame from a dictionary of lists

1
2
3
4
5
6
7
data_from_dict_of_lists = {
    "Name": ["Pedro", "James", "John"],
    "Age": [33, 27, 52],
}

dataframe_dict_of_lists = pd.DataFrame(data_from_dict_of_lists)
dataframe_dict_of_lists
Name Age
0 Pedro 33
1 James 27
2 John 52

DataFrame from a list of dicts

1
2
3
4
5
6
7
8
data_from_list_of_dicts = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_from_list_of_dicts = pd.DataFrame(data_from_list_of_dicts)
dataframe_from_list_of_dicts
Name Age
0 Pedro 33
1 James 27
2 John 52

DataFrame from json string

1
2
3
4
data_from_json_string = '[{"Name": "Pedro", "Age": 33}]'

dataframe_from_json_string = pd.read_json(StringIO(data_from_json_string))
dataframe_from_json_string
Name Age
0 Pedro 33

Convert DataFrame back to json

dataframe_from_json_string.to_json()
'{"Name":{"0":"Pedro"},"Age":{"0":33}}'

Change DataFrame orientation to index

dataframe_from_json_string.to_json(orient="index")
'{"0":{"Name":"Pedro","Age":33}}'

Change DataFrame orientation to records

dataframe_from_json_string.to_json(orient="records")
'[{"Name":"Pedro","Age":33}]'

Dataframe data types

dataframe_from_json_string.dtypes
Name    object
Age      int64
dtype: object

First rows of a DataFrame

1
2
3
4
5
6
7
8
9
data_for_methods_examples = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_methods_examples = pd.DataFrame(data_for_methods_examples)

dataframe_for_methods_examples.head(n=2)
Name Age
0 Pedro 33
1 James 27

Information about a DataFrame

dataframe_for_methods_examples.info()
<class 'pandas.core.frame.DataFrame'>

RangeIndex: 3 entries, 0 to 2

Data columns (total 2 columns):

 #   Column  Non-Null Count  Dtype 

---  ------  --------------  ----- 

 0   Name    3 non-null      object

 1   Age     3 non-null      int64 

dtypes: int64(1), object(1)

memory usage: 180.0+ bytes

Describing a DataFrame

dataframe_for_methods_examples.describe()
Age
count 3.000000
mean 37.333333
std 13.051181
min 27.000000
25% 30.000000
50% 33.000000
75% 42.500000
max 52.000000

Last rows of a DataFrame

dataframe_for_methods_examples.tail(2)
Name Age
1 James 27
2 John 52

Renaming columns in a DataFrame

dataframe_for_methods_examples.rename({"Name": "First Name"})
Name Age
0 Pedro 33
1 James 27
2 John 52

Accessing DataFrame elements

Accessing a DataFrame Serie

print(dataframe_for_methods_examples["Name"])
print(type(dataframe_for_methods_examples["Name"]))
0    Pedro

1    James

2     John

Name: Name, dtype: object

<class 'pandas.core.series.Series'>

DataFrame Index

data_for_index_usage = [
    [1, "Alice", 25],
    [2, "Bob", 30],
    [3, "Charlie", 22],
]

columns_for_index_usage = ["ID", "Name", "Age"]

dataframe_for_index_usage = pd.DataFrame(
    data_for_index_usage, columns=columns_for_index_usage
)

dataframe_for_index_usage.head(n=2)
ID Name Age
0 1 Alice 25
1 2 Bob 30
dataframe_for_index_usage.index
RangeIndex(start=0, stop=3, step=1)

Replacing index

1
2
3
dataframe_for_index_usage.index = ["a", "b", "c"]

dataframe_for_index_usage.head(n=2)
ID Name Age
a 1 Alice 25
b 2 Bob 30

Accessing a DataFrame row index

dataframe_for_index_usage.loc["a"]
ID          1
Name    Alice
Age        25
Name: a, dtype: object

Accessing a DataFrame by position with iloc

dataframe_for_index_usage.iloc[1]
ID        2
Name    Bob
Age      30
Name: b, dtype: object

Accessing a value from a specific row/column at index with at

dataframe_for_index_usage.at["a", "Age"]
np.int64(25)

Accessing a value from a specific row/column at position with iat

dataframe_for_index_usage.iat[2, 1]
'Charlie'

Resetting index

dataframe_for_index_usage.reset_index()
index ID Name Age
0 a 1 Alice 25
1 b 2 Bob 30
2 c 3 Charlie 22

Modifying DataFrames

1
2
3
4
5
6
7
8
data_for_manipulation = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_manipulation = pd.DataFrame(data_for_manipulation)
dataframe_for_manipulation.head(2)
Name Age
0 Pedro 33
1 James 27

Adding a new column to a DataFrame

dataframe_for_manipulation["City"] = ["New York", "Florida", "Los Angeles"]
dataframe_for_manipulation
Name Age City
0 Pedro 33 New York
1 James 27 Florida
2 John 52 Los Angeles

Dropping a column, returns a new DataFrame unless inplace=True

dataframe_for_manipulation.drop("City", axis=1)  # axis=0 rows, axis=1 columns
Name Age
0 Pedro 33
1 James 27
2 John 52

Deleting a column, modifies the original DataFrame

del dataframe_for_manipulation["City"]

Removing a row

dataframe_for_manipulation.drop(1, axis=0)
Name Age
0 Pedro 33
2 John 52

Element-wise operations on DataFrames

dataframe_for_manipulation["Age"] = dataframe_for_manipulation["Age"] + 1
dataframe_for_manipulation
Name Age
0 Pedro 34
1 James 28
2 John 53

Filtering DataFrames

1
2
3
4
5
6
7
8
9
dataframe_for_filtering = pd.DataFrame(
    [
        {"Name": "Pedro", "Age": 33},
        {"Name": "James", "Age": 27},
        {"Name": "John", "Age": 52},
        {"Name": "Alice", "Age": 30},
        {"Name": "Bob", "Age": 22},
    ]
)

Filter DataFrame based on a condition

1
2
3
4
dataframe_for_filtering[
    (dataframe_for_filtering["Age"] > 30)
    & (dataframe_for_filtering["Name"].str.startswith("J"))
]
Name Age
2 John 52

String operations on DataFrames

1
2
3
4
5
6
7
8
dataframe_for_string_operations = pd.DataFrame(
    {
        "Name": ["Pedro", "James", "John"],
        "City": ["New York", "Florida", "Los Angeles"],
    }
)

dataframe_for_string_operations["Name"].str.lower()
0    pedro
1    james
2     john
Name: Name, dtype: object
dataframe_for_string_operations["City"].str.replace(" ", "_").str.lower()
0       new_york
1        florida
2    los_angeles
Name: City, dtype: object

Handling missing data

Handling missing values

1
2
3
4
5
6
7
8
9
data_with_missing = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": None},
    {"Name": "John", "Age": 52},
]

dataframe_with_missing = pd.DataFrame(data_with_missing)

dataframe_with_missing.isnull().any()
Name    False
Age      True
dtype: bool
dataframe_with_missing.isnull().sum()
Name    0
Age     1
dtype: int64
dataframe_with_missing[dataframe_with_missing.isnull().any(axis=1)]
Name Age
1 James NaN
dataframe_with_missing.fillna(0)
Name Age
0 Pedro 33.0
1 James 0.0
2 John 52.0
1
2
3
4
dataframe_with_missing["Age_fill_NA"] = dataframe_with_missing["Age"].fillna(
    dataframe_with_missing["Age"].mean()
)
dataframe_with_missing
Name Age Age_fill_NA
0 Pedro 33.0 33.0
1 James NaN 42.5
2 John 52.0 52.0

Casting Data

1
2
3
4
5
6
7
8
9
data_for_casting = [
    {"Name": "Pedro", "Age": 33.0},
    {"Name": "James", "Age": 27.0},
    {"Name": "John", "Age": 52.0},
]

dataframe_for_casting = pd.DataFrame(data_for_casting)

dataframe_for_casting.head()
Name Age
0 Pedro 33.0
1 James 27.0
2 John 52.0

Casting a DataFrame Series to integer

dataframe_for_casting["Age"] = dataframe_for_casting["Age"].astype(int)
dataframe_for_casting
Name Age
0 Pedro 33
1 James 27
2 John 52

Applying function on column

Applying a lambda function to a DataFrame Series

1
2
3
4
5
6
7
current_year = datetime.now().year

dataframe_for_casting["BirthYear"] = dataframe_for_casting["Age"].apply(
    lambda x: current_year - x
)

dataframe_for_casting.head()
Name Age BirthYear
0 Pedro 33 1992
1 James 27 1998
2 John 52 1973

Summarizing Operations

size = 10

start_date = pd.to_datetime("2023-01-01")

data_for_summarization = {
    "order_id": np.arange(1, size + 1),
    "product": np.random.choice(
        ["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
    ),
    "region": np.random.choice(["North", "South", "East", "West"], size=size),
    "price": np.random.randint(100, 2000, size=size),
    "quantity": np.random.randint(1, 10, size=size),
    "order_date": start_date
    + pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}

dataframe_for_summarization = pd.DataFrame(data_for_summarization)

dataframe_for_summarization.head(10)
order_id product region price quantity order_date
0 1 Laptop South 1913 9 2023-08-10
1 2 Phone East 1858 9 2023-07-16
2 3 Monitor South 229 3 2023-04-26
3 4 Keyboard North 1125 1 2023-11-19
4 5 Laptop East 688 2 2023-03-10
5 6 Laptop West 482 4 2023-11-06
6 7 Monitor East 1371 7 2023-06-12
7 8 Mouse West 800 1 2023-07-19
8 9 Mouse South 436 8 2023-07-17
9 10 Laptop South 369 8 2023-07-10

Summing a DataFrame Series

dataframe_for_summarization.quantity.sum()
np.int64(52)

Descriptive statistics of a DataFrame Series

dataframe_for_summarization.quantity.describe()
count    10.00000
mean      5.20000
std       3.32666
min       1.00000
25%       2.25000
50%       5.50000
75%       8.00000
max       9.00000
Name: quantity, dtype: float64

Finding unique values in a DataFrame Series

dataframe_for_summarization["product"].unique()
array(['Laptop', 'Phone', 'Monitor', 'Keyboard', 'Mouse'], dtype=object)

Finding number of unique values in a DataFrame Series

dataframe_for_summarization["product"].nunique()
5

Grouping

Make a mock dataset

size = 10

start_date = pd.to_datetime("2023-01-01")

data_for_aggregation = {
    "order_id": np.arange(1, size + 1),
    "product": np.random.choice(
        ["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
    ),
    "region": np.random.choice(["North", "South", "East", "West"], size=size),
    "price": np.random.randint(100, 2000, size=size),
    "quantity": np.random.randint(1, 10, size=size),
    "order_date": start_date
    + pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}

dataframe_for_aggregation = pd.DataFrame(data_for_aggregation)

dataframe_for_aggregation.head(10)
order_id product region price quantity order_date
0 1 Phone East 1255 5 2023-02-11
1 2 Keyboard South 1054 5 2023-04-15
2 3 Keyboard South 133 3 2023-07-23
3 4 Mouse East 1983 1 2023-09-11
4 5 Mouse North 958 6 2023-10-30
5 6 Keyboard West 456 1 2023-01-15
6 7 Phone South 297 3 2023-12-24
7 8 Phone West 1787 8 2023-09-04
8 9 Laptop East 1978 7 2023-10-29
9 10 Monitor South 1482 6 2023-07-07

Group by a column and sum

dataframe_for_aggregation.groupby("product").quantity.sum().reset_index()
product quantity
0 Keyboard 9
1 Laptop 7
2 Monitor 6
3 Mouse 7
4 Phone 16

Group by multiple columns and sum

1
2
3
dataframe_for_aggregation.groupby(
    ["region", "product"]
).quantity.sum().reset_index()
region product quantity
0 East Laptop 7
1 East Mouse 1
2 East Phone 5
3 North Mouse 6
4 South Keyboard 8
5 South Monitor 6
6 South Phone 3
7 West Keyboard 1
8 West Phone 8

Aggregate multiple functions

1
2
3
dataframe_for_aggregation.groupby("product").quantity.agg(
    ["mean", "sum", "count"]
).reset_index()
product mean sum count
0 Keyboard 3.000000 9 3
1 Laptop 7.000000 7 1
2 Monitor 6.000000 6 1
3 Mouse 3.500000 7 2
4 Phone 5.333333 16 3

Merging and Joining DataFrames

1
2
3
4
5
6
dataframe_for_join_a = pd.DataFrame(
    {"Key": ["A", "B", "C"], "Value": [1, 2, 3]}
)
dataframe_for_join_b = pd.DataFrame(
    {"Key": ["A", "B", "D"], "Value": [4, 5, 6]}
)

Merging inner

\(A \cap B\)

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="inner")
Key Value_x Value_y
0 A 1 4
1 B 2 5

Merging outer

\(A \cup B\)

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="outer")
Key Value_x Value_y
0 A 1.0 4.0
1 B 2.0 5.0
2 C 3.0 NaN
3 D NaN 6.0

Merging left

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="left")
Key Value_x Value_y
0 A 1 4.0
1 B 2 5.0
2 C 3 NaN

Merging right

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="right")
Key Value_x Value_y
0 A 1.0 4
1 B 2.0 5
2 D NaN 6

Reading the result of a SQL query into a DataFrame

dataframe_for_sqlite_queries = pd.DataFrame(
    [[1, "Alice"], [2, "Bob"], [3, "Charlie"], [4, "Diana"], [5, "Ethan"]],
    columns=["ID", "Name"],
)

sqlite_database_for_pandas_queries = sqlite3.connect(":memory:")

dataframe_for_sqlite_queries.to_sql(
    "users",
    sqlite_database_for_pandas_queries,
    index=False,
    if_exists="replace",
)

dataframe_from_sql_query = pd.read_sql_query(
    "SELECT * FROM users WHERE ID <= 3", sqlite_database_for_pandas_queries
)

sqlite_database_for_pandas_queries.close()

dataframe_from_sql_query
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie