Skip to content

Pandas

Pandas is a data manipulation library in python, used for data analysis and cleaning.

It provides two primary structures:

  • Series: one-dimension array like object.
  • DataFrames: two-dimensional, size-mutable tabular data structure.

Install packages

You can use both pip or astral/uv.

1
2
3
4
!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.0

Import packages

1
2
3
4
5
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd

Usage

Series

Create series from List

1
2
3
4
5
6
data_from_list = [1, 2, 3, 4, 5]

series_from_list = pd.Series(data_from_list)

print(f"Series:\n{series_from_list}")
print(type(series_from_list))

Series:

0 1

1 2

2 3

3 4

4 5

dtype: int64

Create Series from Dict

1
2
3
4
5
data_from_dict = {"a": 1, "b": 2}

series_from_dict = pd.Series(data_from_dict)

print(f"Series:\n{series_from_dict}")

Series:

a 1

b 2

dtype: int64

Working with Series indexes

1
2
3
4
5
6
7
8
data_from_series_with_index = [10, 20, 30]
series_index = ["a", "b", "c"]

series_from_data_with_index = pd.Series(
    data_from_series_with_index, index=series_index
)

print(f"Series:\n{series_from_data_with_index}")

Series:

a 10

b 20

c 30

dtype: int64

Data frames

Loading Data

Data frame from a dictionary of lists
1
2
3
4
5
6
7
data_from_dict_of_lists = {
    "Name": ["Pedro", "James", "John"],
    "Age": [33, 27, 52],
}

dataframe_dict_of_lists = pd.DataFrame(data_from_dict_of_lists)
dataframe_dict_of_lists
Name Age
0 Pedro 33
1 James 27
2 John 52
Data frame from a list of dicts
1
2
3
4
5
6
7
8
data_from_list_of_dicts = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_from_list_of_dicts = pd.DataFrame(data_from_list_of_dicts)
dataframe_from_list_of_dicts
Name Age
0 Pedro 33
1 James 27
2 John 52
Data frame from json

From json string

1
2
3
4
data_from_json_string = '[{"Name": "Pedro", "Age": 33}]'

dataframe_from_json_string = pd.read_json(StringIO(data_from_json_string))
dataframe_from_json_string
Name Age
0 Pedro 33

Convert DataFrame back to json

dataframe_from_json_string.to_json()

'{"Name":{"0":"Pedro"},"Age":{"0":33}}'

dataframe_from_json_string.to_json(orient="index")

'{"0":{"Name":"Pedro","Age":33}}'

dataframe_from_json_string.to_json(orient="records")

'[{"Name":"Pedro","Age":33}]'

Dataframe attributes

dataframe_from_json_string.dtypes

Name object Age int64 dtype: object

Dataframe methods

DataFrame.head
1
2
3
4
5
6
7
8
data_for_methods_examples = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_methods_examples = pd.DataFrame(data_for_methods_examples)
dataframe_for_methods_examples.head(2)
Name Age
0 Pedro 33
1 James 27

DataFrame.info

dataframe_for_methods_examples.info()

RangeIndex: 3 entries, 0 to 2

Data columns (total 2 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 Name 3 non-null object

1 Age 3 non-null int64

dtypes: int64(1), object(1)

memory usage: 180.0+ bytes

DataFrame.describe
dataframe_for_methods_examples.describe()
Age
count 3.000000
mean 37.333333
std 13.051181
min 27.000000
25% 30.000000
50% 33.000000
75% 42.500000
max 52.000000
DataFrame.tail
dataframe_for_methods_examples.tail(2)
Name Age
1 James 27
2 John 52
Renaming columns
dataframe_for_methods_examples.rename({"Name": "First Name"})
Name Age
0 Pedro 33
1 James 27
2 John 52
Accessing a Series
print(dataframe_for_methods_examples["Name"])
print(type(dataframe_for_methods_examples["Name"]))

0 Pedro

1 James

2 John

Name: Name, dtype: object

Accessing an row index
dataframe_for_methods_examples.loc[0]

Name Pedro Age 33 Name: 0, dtype: object

Accessing a particular value
dataframe_for_methods_examples.at[1, "Age"]

np.int64(27)

dataframe_for_methods_examples.iat[2, 1]

np.int64(52)

Dataframe manipulation

1
2
3
4
5
6
7
8
data_for_manipulation = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_manipulation = pd.DataFrame(data_for_manipulation)
dataframe_for_manipulation.head(2)
Name Age
0 Pedro 33
1 James 27
Adding a new column
dataframe_for_manipulation["City"] = ["New York", "Florida", "Los Angeles"]
dataframe_for_manipulation
Name Age City
0 Pedro 33 New York
1 James 27 Florida
2 John 52 Los Angeles
Removing a column
dataframe_for_manipulation.drop("City", axis=1)  # axis=0 rows, axis=1 columns
Name Age
0 Pedro 33
1 James 27
2 John 52
Removing a row
dataframe_for_manipulation.drop(1, axis=0)
Name Age City
0 Pedro 33 New York
2 John 52 Los Angeles
Modify column value
dataframe_for_manipulation["Age"] = dataframe_for_manipulation["Age"] + 1
dataframe_for_manipulation
Name Age City
0 Pedro 34 New York
1 James 28 Florida
2 John 53 Los Angeles
Handling missing values
1
2
3
4
5
6
7
8
9
data_with_missing = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": None},
    {"Name": "John", "Age": 52},
]

dataframe_with_missing = pd.DataFrame(data_with_missing)

dataframe_with_missing.isnull().any()

Name False Age True dtype: bool

dataframe_with_missing.isnull().sum()

Name 0 Age 1 dtype: int64

dataframe_with_missing[dataframe_with_missing.isnull().any(axis=1)]
Name Age
1 James NaN
dataframe_with_missing.fillna(0)
Name Age
0 Pedro 33.0
1 James 0.0
2 John 52.0
1
2
3
4
dataframe_with_missing["Age_fill_NA"] = dataframe_with_missing["Age"].fillna(
    dataframe_with_missing["Age"].mean()
)
dataframe_with_missing
Name Age Age_fill_NA
0 Pedro 33.0 33.0
1 James NaN 42.5
2 John 52.0 52.0

Data manipulation

Casting
1
2
3
4
5
6
7
8
9
data_for_casting = [
    {"Name": "Pedro", "Age": 33.0},
    {"Name": "James", "Age": 27.0},
    {"Name": "John", "Age": 52.0},
]

dataframe_for_casting = pd.DataFrame(data_for_casting)

dataframe_for_casting.head()
Name Age
0 Pedro 33.0
1 James 27.0
2 John 52.0

Casting a DataFrame Series to integer

dataframe_for_casting["Age"] = dataframe_for_casting["Age"].astype(int)
dataframe_for_casting
Name Age
0 Pedro 33
1 James 27
2 John 52
Applying function on column

Applying a lambda function to a DataFrame Series

1
2
3
4
5
6
7
current_year = datetime.now().year

dataframe_for_casting["BirthYear"] = dataframe_for_casting["Age"].apply(
    lambda x: current_year - x
)

dataframe_for_casting.head()
Name Age BirthYear
0 Pedro 33 1992
1 James 27 1998
2 John 52 1973
Aggregation

Make a mock dataset

size = 10

start_date = pd.to_datetime("2023-01-01")

data_for_aggregation = {
    "order_id": np.arange(1, size + 1),
    "product": np.random.choice(
        ["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
    ),
    "region": np.random.choice(["North", "South", "East", "West"], size=size),
    "price": np.random.randint(100, 2000, size=size),
    "quantity": np.random.randint(1, 10, size=size),
    "order_date": start_date
    + pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}

dataframe_for_aggregation = pd.DataFrame(data_for_aggregation)

dataframe_for_aggregation.head(10)
order_id product region price quantity order_date
0 1 Laptop North 655 5 2023-05-11
1 2 Keyboard East 892 9 2023-05-11
2 3 Phone South 1092 7 2023-06-23
3 4 Keyboard West 1864 5 2023-05-08
4 5 Phone East 1287 8 2023-06-08
5 6 Mouse North 1417 1 2023-02-09
6 7 Keyboard West 1299 2 2023-08-18
7 8 Monitor South 282 4 2023-07-23
8 9 Laptop East 236 8 2023-01-04
9 10 Laptop East 844 1 2023-06-11

Group by a column and sum

dataframe_for_aggregation.groupby("product").quantity.sum().reset_index()
product quantity
0 Keyboard 16
1 Laptop 14
2 Monitor 4
3 Mouse 1
4 Phone 15

Group by multiple columns and sum

1
2
3
dataframe_for_aggregation.groupby(
    ["region", "product"]
).quantity.sum().reset_index()
region product quantity
0 East Keyboard 9
1 East Laptop 9
2 East Phone 8
3 North Laptop 5
4 North Mouse 1
5 South Monitor 4
6 South Phone 7
7 West Keyboard 7

Aggregate multiple functions

1
2
3
dataframe_for_aggregation.groupby("product").quantity.agg(
    ["mean", "sum", "count"]
).reset_index()
product mean sum count
0 Keyboard 5.333333 16 3
1 Laptop 4.666667 14 3
2 Monitor 4.000000 4 1
3 Mouse 1.000000 1 1
4 Phone 7.500000 15 2
Merging and Joining DataFrames
1
2
3
4
5
6
dataframe_for_join_a = pd.DataFrame(
    {"Key": ["A", "B", "C"], "Value": [1, 2, 3]}
)
dataframe_for_join_b = pd.DataFrame(
    {"Key": ["A", "B", "D"], "Value": [4, 5, 6]}
)

Merging inner

\(A \cap B\)

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="inner")
Key Value_x Value_y
0 A 1 4
1 B 2 5

Merging outer

\(A \cup B\)

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="outer")
Key Value_x Value_y
0 A 1.0 4.0
1 B 2.0 5.0
2 C 3.0 NaN
3 D NaN 6.0

Merging left

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="left")
Key Value_x Value_y
0 A 1 4.0
1 B 2 5.0
2 C 3 NaN

Merging right

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="right")
Key Value_x Value_y
0 A 1.0 4
1 B 2.0 5
2 D NaN 6