Pandas
Pandas is a data manipulation library in python, used for data analysis and cleaning.
It provides two primary structures:
- Series: one-dimension array like object.
- DataFrames: two-dimensional, size-mutable tabular data structure.
Install packages
| !uv pip install -q \
pandas==2.3.2 \
pandas-stubs==2.3.2.250827 \
numpy==2.3.0
|
Import packages
| import sqlite3
from datetime import datetime
from io import StringIO
import numpy as np
import pandas as pd
|
Series
Create series from List
| data_from_list = [1, 2, 3, 4, 5]
series_from_list = pd.Series(data_from_list)
print(f"Series:\n{series_from_list}")
print(type(series_from_list))
|
Series:
0 1
1 2
2 3
3 4
4 5
dtype: int64
<class 'pandas.core.series.Series'>
Create Series from Dict
| data_from_dict = {"a": 1, "b": 2}
series_from_dict = pd.Series(data_from_dict)
print(f"Series:\n{series_from_dict}")
|
Series:
a 1
b 2
dtype: int64
Working with Series indexes
| data_from_series_with_index = [10, 20, 30]
series_index = ["a", "b", "c"]
series_from_data_with_index = pd.Series(
data_from_series_with_index, index=series_index
)
print(f"Series:\n{series_from_data_with_index}")
|
Series:
a 10
b 20
c 30
dtype: int64
Data frames
Creating DataFrames
DataFrame from a list of lists
| data_from_list_of_lists = [
[1, "Alice", 25],
[2, "Bob", 30],
[3, "Charlie", 22],
]
columns_for_list_of_lists = ["ID", "Name", "Age"]
df_from_list_of_lists = pd.DataFrame(
data_from_list_of_lists,
columns=columns_for_list_of_lists,
)
df_from_list_of_lists
|
|
ID |
Name |
Age |
| 0 |
1 |
Alice |
25 |
| 1 |
2 |
Bob |
30 |
| 2 |
3 |
Charlie |
22 |
DataFrame from a dictionary of lists
| data_from_dict_of_lists = {
"Name": ["Pedro", "James", "John"],
"Age": [33, 27, 52],
}
dataframe_dict_of_lists = pd.DataFrame(data_from_dict_of_lists)
dataframe_dict_of_lists
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
| 2 |
John |
52 |
DataFrame from a list of dicts
| data_from_list_of_dicts = [
{"Name": "Pedro", "Age": 33},
{"Name": "James", "Age": 27},
{"Name": "John", "Age": 52},
]
dataframe_from_list_of_dicts = pd.DataFrame(data_from_list_of_dicts)
dataframe_from_list_of_dicts
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
| 2 |
John |
52 |
DataFrame from json string
| data_from_json_string = '[{"Name": "Pedro", "Age": 33}]'
dataframe_from_json_string = pd.read_json(StringIO(data_from_json_string))
dataframe_from_json_string
|
Convert DataFrame back to json
| dataframe_from_json_string.to_json()
|
'{"Name":{"0":"Pedro"},"Age":{"0":33}}'
Change DataFrame orientation to index
| dataframe_from_json_string.to_json(orient="index")
|
'{"0":{"Name":"Pedro","Age":33}}'
Change DataFrame orientation to records
| dataframe_from_json_string.to_json(orient="records")
|
'[{"Name":"Pedro","Age":33}]'
Dataframe data types
| dataframe_from_json_string.dtypes
|
Name object
Age int64
dtype: object
First rows of a DataFrame
| data_for_methods_examples = [
{"Name": "Pedro", "Age": 33},
{"Name": "James", "Age": 27},
{"Name": "John", "Age": 52},
]
dataframe_for_methods_examples = pd.DataFrame(data_for_methods_examples)
dataframe_for_methods_examples.head(n=2)
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
Information about a DataFrame
| dataframe_for_methods_examples.info()
|
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 3 non-null object
1 Age 3 non-null int64
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes
Describing a DataFrame
| dataframe_for_methods_examples.describe()
|
|
Age |
| count |
3.000000 |
| mean |
37.333333 |
| std |
13.051181 |
| min |
27.000000 |
| 25% |
30.000000 |
| 50% |
33.000000 |
| 75% |
42.500000 |
| max |
52.000000 |
Last rows of a DataFrame
| dataframe_for_methods_examples.tail(2)
|
|
Name |
Age |
| 1 |
James |
27 |
| 2 |
John |
52 |
Renaming columns in a DataFrame
| dataframe_for_methods_examples.rename({"Name": "First Name"})
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
| 2 |
John |
52 |
Accessing DataFrame elements
Accessing a DataFrame Serie
| print(dataframe_for_methods_examples["Name"])
print(type(dataframe_for_methods_examples["Name"]))
|
0 Pedro
1 James
2 John
Name: Name, dtype: object
<class 'pandas.core.series.Series'>
DataFrame Index
| data_for_index_usage = [
[1, "Alice", 25],
[2, "Bob", 30],
[3, "Charlie", 22],
]
columns_for_index_usage = ["ID", "Name", "Age"]
dataframe_for_index_usage = pd.DataFrame(
data_for_index_usage, columns=columns_for_index_usage
)
dataframe_for_index_usage.head(n=2)
|
|
ID |
Name |
Age |
| 0 |
1 |
Alice |
25 |
| 1 |
2 |
Bob |
30 |
| dataframe_for_index_usage.index
|
RangeIndex(start=0, stop=3, step=1)
Replacing index
| dataframe_for_index_usage.index = ["a", "b", "c"]
dataframe_for_index_usage.head(n=2)
|
|
ID |
Name |
Age |
| a |
1 |
Alice |
25 |
| b |
2 |
Bob |
30 |
Accessing a DataFrame row index
| dataframe_for_index_usage.loc["a"]
|
ID 1
Name Alice
Age 25
Name: a, dtype: object
Accessing a DataFrame by position with iloc
| dataframe_for_index_usage.iloc[1]
|
ID 2
Name Bob
Age 30
Name: b, dtype: object
Accessing a value from a specific row/column at index with at
| dataframe_for_index_usage.at["a", "Age"]
|
Accessing a value from a specific row/column at position with iat
| dataframe_for_index_usage.iat[2, 1]
|
Resetting index
| dataframe_for_index_usage.reset_index()
|
|
index |
ID |
Name |
Age |
| 0 |
a |
1 |
Alice |
25 |
| 1 |
b |
2 |
Bob |
30 |
| 2 |
c |
3 |
Charlie |
22 |
Modifying DataFrames
| data_for_manipulation = [
{"Name": "Pedro", "Age": 33},
{"Name": "James", "Age": 27},
{"Name": "John", "Age": 52},
]
dataframe_for_manipulation = pd.DataFrame(data_for_manipulation)
dataframe_for_manipulation.head(2)
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
Adding a new column to a DataFrame
| dataframe_for_manipulation["City"] = ["New York", "Florida", "Los Angeles"]
dataframe_for_manipulation
|
|
Name |
Age |
City |
| 0 |
Pedro |
33 |
New York |
| 1 |
James |
27 |
Florida |
| 2 |
John |
52 |
Los Angeles |
Dropping a column, returns a new DataFrame unless inplace=True
| dataframe_for_manipulation.drop("City", axis=1) # axis=0 rows, axis=1 columns
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
| 2 |
John |
52 |
Deleting a column, modifies the original DataFrame
| del dataframe_for_manipulation["City"]
|
Removing a row
| dataframe_for_manipulation.drop(1, axis=0)
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 2 |
John |
52 |
Element-wise operations on DataFrames
| dataframe_for_manipulation["Age"] = dataframe_for_manipulation["Age"] + 1
dataframe_for_manipulation
|
|
Name |
Age |
| 0 |
Pedro |
34 |
| 1 |
James |
28 |
| 2 |
John |
53 |
Filtering DataFrames
| dataframe_for_filtering = pd.DataFrame(
[
{"Name": "Pedro", "Age": 33},
{"Name": "James", "Age": 27},
{"Name": "John", "Age": 52},
{"Name": "Alice", "Age": 30},
{"Name": "Bob", "Age": 22},
]
)
|
Filter DataFrame based on a condition
| dataframe_for_filtering[
(dataframe_for_filtering["Age"] > 30)
& (dataframe_for_filtering["Name"].str.startswith("J"))
]
|
String operations on DataFrames
| dataframe_for_string_operations = pd.DataFrame(
{
"Name": ["Pedro", "James", "John"],
"City": ["New York", "Florida", "Los Angeles"],
}
)
dataframe_for_string_operations["Name"].str.lower()
|
0 pedro
1 james
2 john
Name: Name, dtype: object
| dataframe_for_string_operations["City"].str.replace(" ", "_").str.lower()
|
0 new_york
1 florida
2 los_angeles
Name: City, dtype: object
Handling missing data
Handling missing values
| data_with_missing = [
{"Name": "Pedro", "Age": 33},
{"Name": "James", "Age": None},
{"Name": "John", "Age": 52},
]
dataframe_with_missing = pd.DataFrame(data_with_missing)
dataframe_with_missing.isnull().any()
|
Name False
Age True
dtype: bool
| dataframe_with_missing.isnull().sum()
|
Name 0
Age 1
dtype: int64
| dataframe_with_missing[dataframe_with_missing.isnull().any(axis=1)]
|
| dataframe_with_missing.fillna(0)
|
|
Name |
Age |
| 0 |
Pedro |
33.0 |
| 1 |
James |
0.0 |
| 2 |
John |
52.0 |
| dataframe_with_missing["Age_fill_NA"] = dataframe_with_missing["Age"].fillna(
dataframe_with_missing["Age"].mean()
)
dataframe_with_missing
|
|
Name |
Age |
Age_fill_NA |
| 0 |
Pedro |
33.0 |
33.0 |
| 1 |
James |
NaN |
42.5 |
| 2 |
John |
52.0 |
52.0 |
Casting Data
| data_for_casting = [
{"Name": "Pedro", "Age": 33.0},
{"Name": "James", "Age": 27.0},
{"Name": "John", "Age": 52.0},
]
dataframe_for_casting = pd.DataFrame(data_for_casting)
dataframe_for_casting.head()
|
|
Name |
Age |
| 0 |
Pedro |
33.0 |
| 1 |
James |
27.0 |
| 2 |
John |
52.0 |
Casting a DataFrame Series to integer
| dataframe_for_casting["Age"] = dataframe_for_casting["Age"].astype(int)
dataframe_for_casting
|
|
Name |
Age |
| 0 |
Pedro |
33 |
| 1 |
James |
27 |
| 2 |
John |
52 |
Applying function on column
Applying a lambda function to a DataFrame Series
| current_year = datetime.now().year
dataframe_for_casting["BirthYear"] = dataframe_for_casting["Age"].apply(
lambda x: current_year - x
)
dataframe_for_casting.head()
|
|
Name |
Age |
BirthYear |
| 0 |
Pedro |
33 |
1992 |
| 1 |
James |
27 |
1998 |
| 2 |
John |
52 |
1973 |
Summarizing Operations
| size = 10
start_date = pd.to_datetime("2023-01-01")
data_for_summarization = {
"order_id": np.arange(1, size + 1),
"product": np.random.choice(
["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
),
"region": np.random.choice(["North", "South", "East", "West"], size=size),
"price": np.random.randint(100, 2000, size=size),
"quantity": np.random.randint(1, 10, size=size),
"order_date": start_date
+ pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}
dataframe_for_summarization = pd.DataFrame(data_for_summarization)
dataframe_for_summarization.head(10)
|
|
order_id |
product |
region |
price |
quantity |
order_date |
| 0 |
1 |
Laptop |
South |
1913 |
9 |
2023-08-10 |
| 1 |
2 |
Phone |
East |
1858 |
9 |
2023-07-16 |
| 2 |
3 |
Monitor |
South |
229 |
3 |
2023-04-26 |
| 3 |
4 |
Keyboard |
North |
1125 |
1 |
2023-11-19 |
| 4 |
5 |
Laptop |
East |
688 |
2 |
2023-03-10 |
| 5 |
6 |
Laptop |
West |
482 |
4 |
2023-11-06 |
| 6 |
7 |
Monitor |
East |
1371 |
7 |
2023-06-12 |
| 7 |
8 |
Mouse |
West |
800 |
1 |
2023-07-19 |
| 8 |
9 |
Mouse |
South |
436 |
8 |
2023-07-17 |
| 9 |
10 |
Laptop |
South |
369 |
8 |
2023-07-10 |
Summing a DataFrame Series
| dataframe_for_summarization.quantity.sum()
|
Descriptive statistics of a DataFrame Series
| dataframe_for_summarization.quantity.describe()
|
count 10.00000
mean 5.20000
std 3.32666
min 1.00000
25% 2.25000
50% 5.50000
75% 8.00000
max 9.00000
Name: quantity, dtype: float64
Finding unique values in a DataFrame Series
| dataframe_for_summarization["product"].unique()
|
array(['Laptop', 'Phone', 'Monitor', 'Keyboard', 'Mouse'], dtype=object)
Finding number of unique values in a DataFrame Series
| dataframe_for_summarization["product"].nunique()
|
Grouping
Make a mock dataset
| size = 10
start_date = pd.to_datetime("2023-01-01")
data_for_aggregation = {
"order_id": np.arange(1, size + 1),
"product": np.random.choice(
["Phone", "Laptop", "Keyboard", "Mouse", "Monitor"], size=size
),
"region": np.random.choice(["North", "South", "East", "West"], size=size),
"price": np.random.randint(100, 2000, size=size),
"quantity": np.random.randint(1, 10, size=size),
"order_date": start_date
+ pd.to_timedelta(np.random.randint(0, 365, size=size), unit="D"),
}
dataframe_for_aggregation = pd.DataFrame(data_for_aggregation)
dataframe_for_aggregation.head(10)
|
|
order_id |
product |
region |
price |
quantity |
order_date |
| 0 |
1 |
Phone |
East |
1255 |
5 |
2023-02-11 |
| 1 |
2 |
Keyboard |
South |
1054 |
5 |
2023-04-15 |
| 2 |
3 |
Keyboard |
South |
133 |
3 |
2023-07-23 |
| 3 |
4 |
Mouse |
East |
1983 |
1 |
2023-09-11 |
| 4 |
5 |
Mouse |
North |
958 |
6 |
2023-10-30 |
| 5 |
6 |
Keyboard |
West |
456 |
1 |
2023-01-15 |
| 6 |
7 |
Phone |
South |
297 |
3 |
2023-12-24 |
| 7 |
8 |
Phone |
West |
1787 |
8 |
2023-09-04 |
| 8 |
9 |
Laptop |
East |
1978 |
7 |
2023-10-29 |
| 9 |
10 |
Monitor |
South |
1482 |
6 |
2023-07-07 |
Group by a column and sum
| dataframe_for_aggregation.groupby("product").quantity.sum().reset_index()
|
|
product |
quantity |
| 0 |
Keyboard |
9 |
| 1 |
Laptop |
7 |
| 2 |
Monitor |
6 |
| 3 |
Mouse |
7 |
| 4 |
Phone |
16 |
Group by multiple columns and sum
| dataframe_for_aggregation.groupby(
["region", "product"]
).quantity.sum().reset_index()
|
|
region |
product |
quantity |
| 0 |
East |
Laptop |
7 |
| 1 |
East |
Mouse |
1 |
| 2 |
East |
Phone |
5 |
| 3 |
North |
Mouse |
6 |
| 4 |
South |
Keyboard |
8 |
| 5 |
South |
Monitor |
6 |
| 6 |
South |
Phone |
3 |
| 7 |
West |
Keyboard |
1 |
| 8 |
West |
Phone |
8 |
Aggregate multiple functions
| dataframe_for_aggregation.groupby("product").quantity.agg(
["mean", "sum", "count"]
).reset_index()
|
|
product |
mean |
sum |
count |
| 0 |
Keyboard |
3.000000 |
9 |
3 |
| 1 |
Laptop |
7.000000 |
7 |
1 |
| 2 |
Monitor |
6.000000 |
6 |
1 |
| 3 |
Mouse |
3.500000 |
7 |
2 |
| 4 |
Phone |
5.333333 |
16 |
3 |
Merging and Joining DataFrames
| dataframe_for_join_a = pd.DataFrame(
{"Key": ["A", "B", "C"], "Value": [1, 2, 3]}
)
dataframe_for_join_b = pd.DataFrame(
{"Key": ["A", "B", "D"], "Value": [4, 5, 6]}
)
|
Merging inner
\(A \cap B\)
| pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="inner")
|
|
Key |
Value_x |
Value_y |
| 0 |
A |
1 |
4 |
| 1 |
B |
2 |
5 |
Merging outer
\(A \cup B\)
| pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="outer")
|
|
Key |
Value_x |
Value_y |
| 0 |
A |
1.0 |
4.0 |
| 1 |
B |
2.0 |
5.0 |
| 2 |
C |
3.0 |
NaN |
| 3 |
D |
NaN |
6.0 |
Merging left
| pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="left")
|
|
Key |
Value_x |
Value_y |
| 0 |
A |
1 |
4.0 |
| 1 |
B |
2 |
5.0 |
| 2 |
C |
3 |
NaN |
Merging right
| pd.merge(dataframe_for_join_a, dataframe_for_join_b, on="Key", how="right")
|
|
Key |
Value_x |
Value_y |
| 0 |
A |
1.0 |
4 |
| 1 |
B |
2.0 |
5 |
| 2 |
D |
NaN |
6 |
Reading the result of a SQL query into a DataFrame
| dataframe_for_sqlite_queries = pd.DataFrame(
[[1, "Alice"], [2, "Bob"], [3, "Charlie"], [4, "Diana"], [5, "Ethan"]],
columns=["ID", "Name"],
)
sqlite_database_for_pandas_queries = sqlite3.connect(":memory:")
dataframe_for_sqlite_queries.to_sql(
"users",
sqlite_database_for_pandas_queries,
index=False,
if_exists="replace",
)
dataframe_from_sql_query = pd.read_sql_query(
"SELECT * FROM users WHERE ID <= 3", sqlite_database_for_pandas_queries
)
sqlite_database_for_pandas_queries.close()
dataframe_from_sql_query
|
|
ID |
Name |
| 0 |
1 |
Alice |
| 1 |
2 |
Bob |
| 2 |
3 |
Charlie |