library(dplyr)
library(ggplot2)
library(janitor)
library(arrow)
Transforming data to make it fit for analysis
visualization
dplyr
pandas
siuba
data-transformation
mutate
filter
Learn to use dplyr, pandas and siuba to filter rows & create and change columns
import pandas as pd
from siuba.dply.vector import n, row_number, last
from siuba import *
from plotnine import *
import janitor
C:\Users\umair\ANACON~1\envs\homl3\lib\site-packages\lazy_loader\__init__.py:185: RuntimeWarning: subpackages can technically be lazily loaded, but it causes the package to be eagerly loaded even if it is already lazily loaded.So, you probably shouldn't use subpackages with this lazy feature.
C:\Users\umair\ANACON~1\envs\homl3\lib\site-packages\lazy_loader\__init__.py:185: RuntimeWarning: subpackages can technically be lazily loaded, but it causes the package to be eagerly loaded even if it is already lazily loaded.So, you probably shouldn't use subpackages with this lazy feature.
using ParquetFiles, DataFrames, Tidier
<- arrow::read_parquet("data/df_raw.parquet")
df
<- janitor::clean_names(df)
df
::kable(head(df)) knitr
vehicle_id | frame_id | total_frames | global_time | local_x | local_y | global_x | global_y | v_length | v_width | v_class | v_vel | v_acc | lane_id | preceding | following | space_headway | time_headway |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 12 | 884 | 1113433136100 | 16.884 | 52.74782 | 6042842 | 2133118 | 14.3 | 6.4 | 2 | 6.947704 | 2.188889 | 2 | 0 | 0 | 0 | 0 |
1 | 13 | 884 | 1113433136200 | 16.938 | 53.45500 | 6042842 | 2133119 | 14.3 | 6.4 | 2 | 7.195818 | 3.152102 | 2 | 0 | 0 | 0 | 0 |
1 | 14 | 884 | 1113433136300 | 16.991 | 54.19089 | 6042842 | 2133120 | 14.3 | 6.4 | 2 | 7.522104 | 3.871618 | 2 | 0 | 0 | 0 | 0 |
1 | 15 | 884 | 1113433136400 | 17.045 | 54.96182 | 6042842 | 2133121 | 14.3 | 6.4 | 2 | 7.896343 | 4.184370 | 2 | 0 | 0 | 0 | 0 |
1 | 16 | 884 | 1113433136500 | 17.098 | 55.77100 | 6042842 | 2133123 | 14.3 | 6.4 | 2 | 8.287402 | 3.906572 | 2 | 0 | 0 | 0 | 0 |
1 | 17 | 884 | 1113433136600 | 17.151 | 56.61935 | 6042842 | 2133124 | 14.3 | 6.4 | 2 | 8.679462 | 3.880959 | 2 | 0 | 0 | 0 | 0 |
= pd.read_parquet("data/df_raw.parquet")
df
df.columns
Index(['Vehicle_ID', 'Frame_ID', 'Total_Frames', 'Global_Time', 'Local_X',
'Local_Y', 'Global_X', 'Global_Y', 'v_Length', 'v_Width', 'v_Class',
'v_Vel', 'v_Acc', 'Lane_ID', 'Preceding', 'Following', 'Space_Headway',
'Time_Headway'],
dtype='object')
= df.clean_names()
df
df.columns
Index(['vehicle_id', 'frame_id', 'total_frames', 'global_time', 'local_x',
'local_y', 'global_x', 'global_y', 'v_length', 'v_width', 'v_class',
'v_vel', 'v_acc', 'lane_id', 'preceding', 'following', 'space_headway',
'time_headway'],
dtype='object')
= DataFrame(load("data/df_raw.parquet")) df
1048575×18 DataFrame
Row │ Vehicle_ID Frame_ID Total_Frames Global_Time Local_X Local_Y ⋯
│ Int64 Int64 Int64 Int64 Float64 Float64 ⋯
─────────┼──────────────────────────────────────────────────────────────────────
1 │ 1 12 884 1113433136100 16.884 52.74 ⋯
2 │ 1 13 884 1113433136200 16.938 53.45
3 │ 1 14 884 1113433136300 16.991 54.19
4 │ 1 15 884 1113433136400 17.045 54.96
5 │ 1 16 884 1113433136500 17.098 55.77 ⋯
6 │ 1 17 884 1113433136600 17.151 56.61
7 │ 1 18 884 1113433136700 17.204 57.50
8 │ 1 19 884 1113433136800 17.257 58.43
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
1048569 │ 2911 8586 731 1113433993500 53.639 1510.93 ⋯
1048570 │ 2911 8587 731 1113433993600 53.666 1513.01
1048571 │ 2911 8588 731 1113433993700 53.693 1515.0
1048572 │ 2911 8589 731 1113433993800 53.719 1516.88
1048573 │ 2911 8590 731 1113433993900 53.746 1518.68 ⋯
1048574 │ 2911 8591 731 1113433994000 53.772 1520.39
1048575 │ 2911 8592 731 1113433994100 53.799 1522.03
13 columns and 1048560 rows omitted
= @chain df begin
df @clean_names
end
1048575×18 DataFrame
Row │ vehicle_i_d frame_i_d total_frames global_time local_x local ⋯
│ Int64 Int64 Int64 Int64 Float64 Float ⋯
─────────┼──────────────────────────────────────────────────────────────────────
1 │ 1 12 884 1113433136100 16.884 52. ⋯
2 │ 1 13 884 1113433136200 16.938 53.
3 │ 1 14 884 1113433136300 16.991 54.
4 │ 1 15 884 1113433136400 17.045 54.
5 │ 1 16 884 1113433136500 17.098 55. ⋯
6 │ 1 17 884 1113433136600 17.151 56.
7 │ 1 18 884 1113433136700 17.204 57.
8 │ 1 19 884 1113433136800 17.257 58.
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
1048569 │ 2911 8586 731 1113433993500 53.639 1510. ⋯
1048570 │ 2911 8587 731 1113433993600 53.666 1513.
1048571 │ 2911 8588 731 1113433993700 53.693 1515.
1048572 │ 2911 8589 731 1113433993800 53.719 1516.
1048573 │ 2911 8590 731 1113433993900 53.746 1518. ⋯
1048574 │ 2911 8591 731 1113433994000 53.772 1520.
1048575 │ 2911 8592 731 1113433994100 53.799 1522.
13 columns and 1048560 rows omitted
names(df)
18-element Vector{String}:
"vehicle_i_d"
"frame_i_d"
"total_frames"
"global_time"
"local_x"
"local_y"
"global_x"
"global_y"
"v_length"
"v_width"
"v_class"
"v_vel"
"v_acc"
"lane_i_d"
"preceding"
"following"
"space_headway"
"time_headway"
= @chain df begin
df @rename(vehicle_id = vehicle_i_d,
= frame_i_d,
frame_id = lane_i_d)
lane_id end
1048575×18 DataFrame
Row │ vehicle_id frame_id total_frames global_time local_x local_y ⋯
│ Int64 Int64 Int64 Int64 Float64 Float64 ⋯
─────────┼──────────────────────────────────────────────────────────────────────
1 │ 1 12 884 1113433136100 16.884 52.74 ⋯
2 │ 1 13 884 1113433136200 16.938 53.45
3 │ 1 14 884 1113433136300 16.991 54.19
4 │ 1 15 884 1113433136400 17.045 54.96
5 │ 1 16 884 1113433136500 17.098 55.77 ⋯
6 │ 1 17 884 1113433136600 17.151 56.61
7 │ 1 18 884 1113433136700 17.204 57.50
8 │ 1 19 884 1113433136800 17.257 58.43
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
1048569 │ 2911 8586 731 1113433993500 53.639 1510.93 ⋯
1048570 │ 2911 8587 731 1113433993600 53.666 1513.01
1048571 │ 2911 8588 731 1113433993700 53.693 1515.0
1048572 │ 2911 8589 731 1113433993800 53.719 1516.88
1048573 │ 2911 8590 731 1113433993900 53.746 1518.68 ⋯
1048574 │ 2911 8591 731 1113433994000 53.772 1520.39
1048575 │ 2911 8592 731 1113433994100 53.799 1522.03
13 columns and 1048560 rows omitted
<- df |>
df ::group_by(frame_id) |> # grouping by frame id
dplyr::mutate(preceding_local_y = local_y[match(preceding, vehicle_id)],
dplyrpreceding_length = v_length[match(preceding, vehicle_id)],
preceding_width = v_width[match(preceding, vehicle_id)],
preceding_class= v_class[match(preceding, vehicle_id)],
preceding_vel = v_vel[match(preceding, vehicle_id)],
preceding_acc = v_acc[match(preceding, vehicle_id)]) |>
::ungroup() dplyr
= df.merge(
df 'frame_id', 'vehicle_id', 'local_y', 'v_length',
df.loc[:, ['v_width', 'v_class', 'v_vel', 'v_acc']] ,
= ['frame_id', 'preceding'],
left_on = ['frame_id', 'vehicle_id'], how = 'left',
right_on =['', '_preceding']
suffixes
)
= df.drop(['vehicle_id_preceding'], axis = 'columns')
df
df.columns
Index(['vehicle_id', 'frame_id', 'total_frames', 'global_time', 'local_x',
'local_y', 'global_x', 'global_y', 'v_length', 'v_width', 'v_class',
'v_vel', 'v_acc', 'lane_id', 'preceding', 'following', 'space_headway',
'time_headway', 'local_y_preceding', 'v_length_preceding',
'v_width_preceding', 'v_class_preceding', 'v_vel_preceding',
'v_acc_preceding'],
dtype='object')
Rename the columns for preceding
vehicles to match the column names with R
:
= (
df
df >> rename(preceding_local_y = _.local_y_preceding,
= _.v_length_preceding,
preceding_v_length = _.v_width_preceding,
preceding_v_width = _.v_class_preceding,
preceding_v_class = _.v_vel_preceding,
preceding_v_vel = _.v_acc_preceding)
preceding_v_acc
)
df.columns
Index(['vehicle_id', 'frame_id', 'total_frames', 'global_time', 'local_x',
'local_y', 'global_x', 'global_y', 'v_length', 'v_width', 'v_class',
'v_vel', 'v_acc', 'lane_id', 'preceding', 'following', 'space_headway',
'time_headway', 'preceding_local_y', 'preceding_v_length',
'preceding_v_width', 'preceding_v_class', 'preceding_v_vel',
'preceding_v_acc'],
dtype='object')
= leftjoin(
df
df,:, [:frame_id, :vehicle_id, :local_y, :v_length, :v_width, :v_class, :v_vel, :v_acc]],
df[= [:frame_id => :frame_id, :preceding => :vehicle_id],
on = true,
makeunique = "" => "_preceding"
renamecols )
1048575×24 DataFrame
Row │ vehicle_id frame_id total_frames global_time local_x local_y ⋯
│ Int64 Int64 Int64 Int64 Float64 Float64 ⋯
─────────┼──────────────────────────────────────────────────────────────────────
1 │ 11 57 864 1113433140600 17.924 68.918 ⋯
2 │ 11 58 864 1113433140700 17.827 69.534
3 │ 11 59 864 1113433140800 17.73 70.158
4 │ 11 60 864 1113433140900 17.632 70.792
5 │ 11 61 864 1113433141000 17.533 71.434 ⋯
6 │ 11 62 864 1113433141100 17.433 72.083
7 │ 11 63 864 1113433141200 17.333 72.737
8 │ 11 64 864 1113433141300 17.233 73.398
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
1048569 │ 2911 8381 731 1113433973000 61.556 928.234 ⋯
1048570 │ 2911 8382 731 1113433973100 60.848 931.051
1048571 │ 2911 8383 731 1113433973200 60.721 933.884
1048572 │ 2911 8384 731 1113433973300 60.667 936.733
1048573 │ 2911 8385 731 1113433973400 60.779 939.596 ⋯
1048574 │ 2911 8386 731 1113433973500 60.97 942.472
1048575 │ 2911 8387 731 1113433973600 60.618 945.36
19 columns and 1048560 rows omitted
names(df)
24-element Vector{String}:
"vehicle_id"
"frame_id"
"total_frames"
"global_time"
"local_x"
"local_y"
"global_x"
"global_y"
"v_length"
"v_width"
⋮
"following"
"space_headway"
"time_headway"
"local_y_preceding"
"v_length_preceding"
"v_width_preceding"
"v_class_preceding"
"v_vel_preceding"
"v_acc_preceding"
Rename the columns for preceding
vehicles to match the column names with R
:
= @chain df begin
df @rename(preceding_local_y = local_y_preceding,
= v_length_preceding,
preceding_v_length = v_width_preceding,
preceding_v_width = v_class_preceding,
preceding_v_class = v_vel_preceding,
preceding_v_vel = v_acc_preceding)
preceding_v_acc end
1048575×24 DataFrame
Row │ vehicle_id frame_id total_frames global_time local_x local_y ⋯
│ Int64 Int64 Int64 Int64 Float64 Float64 ⋯
─────────┼──────────────────────────────────────────────────────────────────────
1 │ 11 57 864 1113433140600 17.924 68.918 ⋯
2 │ 11 58 864 1113433140700 17.827 69.534
3 │ 11 59 864 1113433140800 17.73 70.158
4 │ 11 60 864 1113433140900 17.632 70.792
5 │ 11 61 864 1113433141000 17.533 71.434 ⋯
6 │ 11 62 864 1113433141100 17.433 72.083
7 │ 11 63 864 1113433141200 17.333 72.737
8 │ 11 64 864 1113433141300 17.233 73.398
⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱
1048569 │ 2911 8381 731 1113433973000 61.556 928.234 ⋯
1048570 │ 2911 8382 731 1113433973100 60.848 931.051
1048571 │ 2911 8383 731 1113433973200 60.721 933.884
1048572 │ 2911 8384 731 1113433973300 60.667 936.733
1048573 │ 2911 8385 731 1113433973400 60.779 939.596 ⋯
1048574 │ 2911 8386 731 1113433973500 60.97 942.472
1048575 │ 2911 8387 731 1113433973600 60.618 945.36
19 columns and 1048560 rows omitted
names(df)
24-element Vector{String}:
"vehicle_id"
"frame_id"
"total_frames"
"global_time"
"local_x"
"local_y"
"global_x"
"global_y"
"v_length"
"v_width"
⋮
"following"
"space_headway"
"time_headway"
"preceding_local_y"
"preceding_v_length"
"preceding_v_width"
"preceding_v_class"
"preceding_v_vel"
"preceding_v_acc"
<- df |>
df select(-c(total_frames, starts_with("global"), following), global_time) |> # Keeping the desired columns only
mutate(across(
.cols = c(starts_with("local"), starts_with("v_"), space_headway, starts_with("preceding"), -preceding, -preceding_class, -v_class),
.fns = ~ round(.x * .3048, 2)
|> # Convert to metric units
)) mutate(across(
.cols = c(vehicle_id, v_class, lane_id, preceding, preceding_class),
.fns = ~ as.factor(.x)
# Convert a few columns to factor type ))
= (
df
df >> select(-_[_.total_frames, _.startswith('global'), _.following],
# Keeping the desired columns only
_.global_time)
)
### converting to metric units
= ['local_x', 'local_y', 'v_length', 'v_width',
cols_to_convert_to_metric 'v_vel', 'v_acc', 'space_headway', 'preceding_local_y',
'preceding_v_length', 'preceding_v_width', 'preceding_v_vel',
'preceding_v_acc']
= df[cols_to_convert_to_metric].apply(lambda x: x * .3048).round(2)
df[cols_to_convert_to_metric]
## change the data type to categorical
= ['vehicle_id', 'v_class', 'lane_id',
cols_to_convert_to_categorical 'preceding', 'preceding_v_class']
for col in cols_to_convert_to_categorical:
= df[col].astype('str',copy=False) df[col]
<- df |>
df mutate(actual_time = as.POSIXct(global_time/1000, origin = "1970-01-01",
tz="America/Los_Angeles")) |>
select(-global_time)
'actual_time'] = pd.to_datetime(df['global_time'] / 1000,
df[='s', origin='1970-01-01', utc=True)
unit'actual_time'] = df['actual_time'].dt.tz_convert('America/Los_Angeles')
df[
= df.drop(['global_time'], axis = 'columns')
df
print(df.columns)
Index(['vehicle_id', 'frame_id', 'local_x', 'local_y', 'v_length', 'v_width',
'v_class', 'v_vel', 'v_acc', 'lane_id', 'preceding', 'space_headway',
'time_headway', 'preceding_local_y', 'preceding_v_length',
'preceding_v_width', 'preceding_v_class', 'preceding_v_vel',
'preceding_v_acc', 'actual_time'],
dtype='object')
## Time space diagram
# ggplot(data = df |>
# filter(lane_id %in% c(1,2,3))) +
# geom_path(aes(x = actual_time,
# y = local_y,
# color = v_vel,
# group = vehicle_id), alpha = 0.5) +
# scale_color_gradient(low = "red", high = "green") +
# facet_grid(lane_id ~ ., labeller = "label_both") +
# labs(x = "Time (HH:MM)",
# y = "Longitudinal position",
# color = "Speed (m/s)") +
# theme_minimal()
## Time space diagram
= (
df_123
df >> filter(_.lane_id.isin(["1", "2", "3"]))
)
C:\Users\umair\ANACON~1\envs\homl3\lib\site-packages\pandas\core\algorithms.py:522: DeprecationWarning: np.find_common_type is deprecated. Please use `np.result_type` or `np.promote_types`.
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information. (Deprecated NumPy 1.25)
(= df_123) +
ggplot(data = 'actual_time',
geom_path(aes(x = 'local_y',
y = 'v_vel',
color = 'vehicle_id'), alpha = 0.5) +
group = "red", high = "green") +
scale_color_gradient(low 'lane_id ~ .', labeller = "label_both") +
facet_grid(= "Time (HH:MM)",
labs(x = "Longitudinal position",
y = "Speed (m/s)") +
color
theme_minimal() )
<Figure Size: (640 x 480)>