Pandas¶

Pandas is a data manipulation library in python, used for data analysis and cleaning.

It provides two primary structures:

Series: one-dimension array like object.
DataFrames: two-dimensional, size-mutable tabular data structure.

Install packages¶

You can use both pip or astral/uv.

!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.0

Import packages¶

from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd

Usage¶

Series¶

Create series from List¶

data_from_list = [1, 2, 3, 4, 5]

series_from_list = pd.Series(data_from_list)

print(f"Series:\n{series_from_list}")
print(type(series_from_list))

Series:

0 1

1 2

2 3

3 4

4 5

dtype: int64

Create Series from Dict¶

data_from_dict = {"a": 1, "b": 2}

series_from_dict = pd.Series(data_from_dict)

print(f"Series:\n{series_from_dict}")

Series:

a 1

b 2

dtype: int64

Working with Series indexes¶

data_from_series_with_index = [10, 20, 30]
series_index = ["a", "b", "c"]

series_from_data_with_index = pd.Series(
    data_from_series_with_index, index=series_index
)

print(f"Series:\n{series_from_data_with_index}")

Series:

a 10

b 20

c 30

dtype: int64

Data frames¶

Loading Data¶

Data frame from a dictionary of lists¶

data_from_dict_of_lists = {
    "Name": ["Pedro", "James", "John"],
    "Age": [33, 27, 52],
}

dataframe_dict_of_lists = pd.DataFrame(data_from_dict_of_lists)
dataframe_dict_of_lists

	Name	Age
0	Pedro	33
1	James	27
2	John	52

Data frame from a list of dicts¶

data_from_list_of_dicts = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_from_list_of_dicts = pd.DataFrame(data_from_list_of_dicts)
dataframe_from_list_of_dicts

	Name	Age
0	Pedro	33
1	James	27
2	John	52

Data frame from json¶

From json string

data_from_json_string = '[{"Name": "Pedro", "Age": 33}]'

dataframe_from_json_string = pd.read_json(StringIO(data_from_json_string))
dataframe_from_json_string

	Name	Age
0	Pedro	33

Convert DataFrame back to json

dataframe_from_json_string.to_json()

'{"Name":{"0":"Pedro"},"Age":{"0":33}}'

dataframe_from_json_string.to_json(orient="index")

'{"0":{"Name":"Pedro","Age":33}}'

dataframe_from_json_string.to_json(orient="records")

'[{"Name":"Pedro","Age":33}]'

Dataframe attributes¶

dataframe_from_json_string.dtypes

Name object Age int64 dtype: object

Dataframe methods¶

DataFrame.head¶

data_for_methods_examples = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_methods_examples = pd.DataFrame(data_for_methods_examples)
dataframe_for_methods_examples.head(2)

	Name	Age
0	Pedro	33
1	James	27

DataFrame.info

dataframe_for_methods_examples.info()

RangeIndex: 3 entries, 0 to 2

Data columns (total 2 columns):

# Column Non-Null Count Dtype

--- ------ -------------- -----

0 Name 3 non-null object

1 Age 3 non-null int64

dtypes: int64(1), object(1)

memory usage: 180.0+ bytes

DataFrame.describe¶

dataframe_for_methods_examples.describe()

	Age
count	3.000000
mean	37.333333
std	13.051181
min	27.000000
25%	30.000000
50%	33.000000
75%	42.500000
max	52.000000

DataFrame.tail¶

dataframe_for_methods_examples.tail(2)

	Name	Age
1	James	27
2	John	52

Renaming columns¶

dataframe_for_methods_examples.rename({"Name": "First Name"})

	Name	Age
0	Pedro	33
1	James	27
2	John	52

Accessing a Series¶

print(dataframe_for_methods_examples["Name"])
print(type(dataframe_for_methods_examples["Name"]))

0 Pedro

1 James

2 John

Name: Name, dtype: object

Accessing an row index¶

dataframe_for_methods_examples.loc[0]

Name Pedro Age 33 Name: 0, dtype: object

Accessing a particular value¶

dataframe_for_methods_examples.at[1, "Age"]

np.int64(27)

dataframe_for_methods_examples.iat[2, 1]

np.int64(52)

Dataframe manipulation¶

data_for_manipulation = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": 27},
    {"Name": "John", "Age": 52},
]

dataframe_for_manipulation = pd.DataFrame(data_for_manipulation)
dataframe_for_manipulation.head(2)

	Name	Age
0	Pedro	33
1	James	27

Adding a new column¶

dataframe_for_manipulation["City"] = ["New York", "Florida", "Los Angeles"]
dataframe_for_manipulation

	Name	Age	City
0	Pedro	33	New York
1	James	27	Florida
2	John	52	Los Angeles

Removing a column¶

dataframe_for_manipulation.drop("City", axis=1)  # axis=0 rows, axis=1 columns

	Name	Age
0	Pedro	33
1	James	27
2	John	52

Removing a row¶

dataframe_for_manipulation.drop(1, axis=0)

	Name	Age	City
0	Pedro	33	New York
2	John	52	Los Angeles

Modify column value¶

dataframe_for_manipulation["Age"] = dataframe_for_manipulation["Age"] + 1
dataframe_for_manipulation

	Name	Age	City
0	Pedro	34	New York
1	James	28	Florida
2	John	53	Los Angeles

Handling missing values¶

data_with_missing = [
    {"Name": "Pedro", "Age": 33},
    {"Name": "James", "Age": None},
    {"Name": "John", "Age": 52},
]

dataframe_with_missing = pd.DataFrame(data_with_missing)

dataframe_with_missing.isnull().any()

Name False Age True dtype: bool

dataframe_with_missing.isnull().sum()

Name 0 Age 1 dtype: int64

dataframe_with_missing[dataframe_with_missing.isnull().any(axis=1)]

	Name	Age
1	James	NaN

dataframe_with_missing.fillna(0)

	Name	Age
0	Pedro	33.0
1	James	0.0
2	John	52.0

dataframe_with_missing["Age_fill_NA"] = dataframe_with_missing["Age"].fillna(
    dataframe_with_missing["Age"].mean()
)
dataframe_with_missing

	Name	Age	Age_fill_NA
0	Pedro	33.0	33.0
1	James	NaN	42.5
2	John	52.0	52.0

Data manipulation¶

Casting¶

data_for_casting = [
    {"Name": "Pedro", "Age": 33.0},
    {"Name": "James", "Age": 27.0},
    {"Name": "John", "Age": 52.0},
]

dataframe_for_casting = pd.DataFrame(data_for_casting)

dataframe_for_casting.head()

	Name	Age
0	Pedro	33.0
1	James	27.0
2	John	52.0

Casting a DataFrame Series to integer

dataframe_for_casting["Age"] = dataframe_for_casting["Age"].astype(int)
dataframe_for_casting

	Name	Age
0	Pedro	33
1	James	27
2	John	52

Applying function on column¶

Applying a lambda function to a DataFrame Series

current_year = datetime.now().year

dataframe_for_casting["BirthYear"] = dataframe_for_casting["Age"].apply(
    lambda x: current_year - x
)

dataframe_for_casting.head()

	Name	Age	BirthYear
0	Pedro	33	1992
1	James	27	1998
2	John	52	1973

Aggregation¶

Make a mock dataset

size = 10

start_date = pd.to_datetime("2023-01-01")

data_for_aggregation = {
    "order_id": np.arange(1, size + 1),
    "product": np.random.choice(
        ["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
    ),
    "region": np.random.choice(["North", "South", "East", "West"], size=size),
    "price": np.random.randint(100, 2000, size=size),
    "quantity": np.random.randint(1, 10, size=size),
    "order_date": start_date
    + pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}

dataframe_for_aggregation = pd.DataFrame(data_for_aggregation)

dataframe_for_aggregation.head(10)

	order_id	product	region	price	quantity	order_date
0	1	Laptop	North	655	5	2023-05-11
1	2	Keyboard	East	892	9	2023-05-11
2	3	Phone	South	1092	7	2023-06-23
3	4	Keyboard	West	1864	5	2023-05-08
4	5	Phone	East	1287	8	2023-06-08
5	6	Mouse	North	1417	1	2023-02-09
6	7	Keyboard	West	1299	2	2023-08-18
7	8	Monitor	South	282	4	2023-07-23
8	9	Laptop	East	236	8	2023-01-04
9	10	Laptop	East	844	1	2023-06-11

Group by a column and sum

dataframe_for_aggregation.groupby("product").quantity.sum().reset_index()

	product	quantity
0	Keyboard	16
1	Laptop	14
2	Monitor	4
3	Mouse	1
4	Phone	15

Group by multiple columns and sum

dataframe_for_aggregation.groupby(
    ["region", "product"]
).quantity.sum().reset_index()

	region	product	quantity
0	East	Keyboard	9
1	East	Laptop	9
2	East	Phone	8
3	North	Laptop	5
4	North	Mouse	1
5	South	Monitor	4
6	South	Phone	7
7	West	Keyboard	7

Aggregate multiple functions

dataframe_for_aggregation.groupby("product").quantity.agg(
    ["mean", "sum", "count"]
).reset_index()

	product	mean	sum	count
0	Keyboard	5.333333	16	3
1	Laptop	4.666667	14	3
2	Monitor	4.000000	4	1
3	Mouse	1.000000	1	1
4	Phone	7.500000	15	2

Merging and Joining DataFrames¶

dataframe_for_join_a = pd.DataFrame(
    {"Key": ["A", "B", "C"], "Value": [1, 2, 3]}
)
dataframe_for_join_b = pd.DataFrame(
    {"Key": ["A", "B", "D"], "Value": [4, 5, 6]}
)

Merging inner

\(A \cap B\)

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="inner")

	Key	Value_x	Value_y
0	A	1	4
1	B	2	5

Merging outer

\(A \cup B\)

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="outer")

	Key	Value_x	Value_y
0	A	1.0	4.0
1	B	2.0	5.0
2	C	3.0	NaN
3	D	NaN	6.0

Merging left

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="left")

	Key	Value_x	Value_y
0	A	1	4.0
1	B	2	5.0
2	C	3	NaN

Merging right

pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="right")

	Key	Value_x	Value_y
0	A	1.0	4
1	B	2.0	5
2	D	NaN	6