To obtain the model summaries, you first need the 2D hexagonal bin centroids, which form the 2D model (df_bin_centroids), and the high-dimensional coordinates of the bin centroids, which represent the lifted model in high dimensions (df_bin).
# Ratio of the embedding's UMAP2 range to its UMAP1 range
r2 <- diff(range(s_curve_noise_umap$UMAP2))/diff(range(s_curve_noise_umap$UMAP1))

# Fit the model: hexagonally bin the scaled 2D embedding and lift the
# bin centroids into the high-dimensional space
model <- fit_highd_model(training_data = s_curve_noise_training,
                         emb_df = s_curve_noise_umap_scaled,
                         bin1 = 6, r2 = r2,
                         col_start_highd = "x")
df_bin_centroids <- model$df_bin_centroids
df_bin <- model$df_bin
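If you want a quick look at these two components before computing summaries, you can inspect them directly. This is only an optional sanity check using glimpse(), as used elsewhere in this vignette.
# Optional check of the two model components: the 2D bin centroids and the
# lifted high-dimensional centroid coordinates
glimpse(df_bin_centroids)
glimpse(df_bin)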
Let’s first compute the model summaries for the training data. To do that, you need to predict the 2D embedding for the training data.
pred_df_training <- predict_emb(test_data = s_curve_noise_training,
                                df_bin_centroids = df_bin_centroids,
                                df_bin = df_bin, type_NLDR = "UMAP")
glimpse(pred_df_training)
#> Rows: 75
#> Columns: 4
#> $ pred_UMAP_1 <dbl> 0.01168629, 0.79349029, 0.90517658, 0.23505886, 0.23505886…
#> $ pred_UMAP_2 <dbl> 0.37839752, 0.95873649, 1.15218282, -0.00849512, 0.3783975…
#> $ ID <int> 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21…
#> $ pred_hb_id <int> 19, 41, 47, 8, 20, 77, 41, 34, 41, 71, 34, 19, 66, 14, 34,…
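Besides the visual check in the plot below, you may also want a simple numerical comparison of the predicted and original embedding coordinates. The sketch below joins the predictions back to the scaled embedding by ID and computes a point-wise distance; the emb_dist and mean_emb_dist names are illustrative, not part of the package output, and dplyr is assumed to be available.
# Sketch: point-wise distance between original and predicted embedding positions
library(dplyr)
pred_df_training |>
  left_join(s_curve_noise_umap_scaled, by = "ID") |>
  mutate(emb_dist = sqrt((pred_UMAP_1 - UMAP1)^2 + (pred_UMAP_2 - UMAP2)^2)) |>
  summarise(mean_emb_dist = mean(emb_dist))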
# Scaled UMAP embedding (black) overlaid with the predicted embedding
# for the training data (red)
s_curve_noise_umap_scaled |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             label = ID)) +
  geom_point(alpha = 0.5) +
  geom_point(data = pred_df_training, aes(x = pred_UMAP_1, y = pred_UMAP_2),
             color = "red", alpha = 0.5) +
  coord_equal() +
  theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
        axis.text = element_text(size = 5),
        axis.title = element_text(size = 7))
Next, you can use the generated 2D embedding to compute model summaries.
glance(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
       training_data = s_curve_noise_training, newdata = NULL,
       type_NLDR = "UMAP", col_start = "x")
#> # A tibble: 1 × 2
#> Error MSE
#> <dbl> <dbl>
#> 1 39.4 0.127
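These summaries depend on the binning resolution chosen when fitting the model. As a rough sketch, you could refit the model for a few bin1 values and collect the glance() output for each; the values 5, 6 and 8 below are arbitrary illustrations, not recommendations, and purrr is assumed to be available.
# Sketch: compare training error across a few (arbitrary) bin resolutions
library(purrr)
map_dfr(c(5, 6, 8), function(b) {
  fit <- fit_highd_model(training_data = s_curve_noise_training,
                         emb_df = s_curve_noise_umap_scaled,
                         bin1 = b, r2 = r2,
                         col_start_highd = "x")
  glance(df_bin_centroids = fit$df_bin_centroids, df_bin = fit$df_bin,
         training_data = s_curve_noise_training, newdata = NULL,
         type_NLDR = "UMAP", col_start = "x") |>
    mutate(bin1 = b)
})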
To obtain a data frame with the high-dimensional data, the fitted model data, the predicted NLDR data, and summary information, you can use augment().
augment(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
        training_data = s_curve_noise_training, newdata = NULL,
        type_NLDR = "UMAP", col_start = "x") |>
  head(5)
#> # A tibble: 5 × 32
#> ID x1 x2 x3 x4 x5 x6 x7 pred_hb_id
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 1 -0.120 0.114 -1.99 -0.00246 -0.0178 -0.0181 -0.00317 19
#> 2 2 -0.0492 0.822 0.00121 0.0161 0.00968 -0.0834 0.00230 41
#> 3 3 -0.774 0.243 0.367 -0.0198 0.00408 -0.0349 -0.00911 47
#> 4 4 -0.606 1.96 -1.80 0.0132 -0.000479 -0.00478 -0.00843 8
#> 5 6 0.818 0.0388 -1.58 0.00253 0.00167 0.0781 -0.00771 20
#> # ℹ 23 more variables: model_high_d_x1 <dbl>, model_high_d_x2 <dbl>,
#> # model_high_d_x3 <dbl>, model_high_d_x4 <dbl>, model_high_d_x5 <dbl>,
#> # model_high_d_x6 <dbl>, model_high_d_x7 <dbl>, error_square_x1 <dbl>,
#> # error_square_x2 <dbl>, error_square_x3 <dbl>, error_square_x4 <dbl>,
#> # error_square_x5 <dbl>, error_square_x6 <dbl>, error_square_x7 <dbl>,
#> # row_wise_total_error <dbl>, abs_error_x1 <dbl>, abs_error_x2 <dbl>,
#> # abs_error_x3 <dbl>, abs_error_x4 <dbl>, abs_error_x5 <dbl>, …
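Since the augmented data carries a row-wise error column, you can also use it to see which training observations the model reconstructs worst. The sketch below simply sorts by row_wise_total_error; the columns kept are an arbitrary choice.
# Sketch: training observations with the largest total reconstruction error
augment(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
        training_data = s_curve_noise_training, newdata = NULL,
        type_NLDR = "UMAP", col_start = "x") |>
  select(ID, pred_hb_id, row_wise_total_error) |>
  arrange(desc(row_wise_total_error)) |>
  head(5)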
You can follow the same workflow for the test data: predict its 2D embedding, overlay it on the scaled embedding, and compute the model summaries.
pred_df_test <- predict_emb(test_data = s_curve_noise_test,
                            df_bin_centroids = df_bin_centroids,
                            df_bin = df_bin, type_NLDR = "UMAP")
# Scaled UMAP embedding (black) overlaid with the predicted embedding
# for the test data (red)
s_curve_noise_umap_scaled |>
  ggplot(aes(x = UMAP1,
             y = UMAP2,
             label = ID)) +
  geom_point(alpha = 0.5) +
  geom_point(data = pred_df_test, aes(x = pred_UMAP_1, y = pred_UMAP_2),
             color = "red", alpha = 0.5) +
  coord_equal() +
  theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
        axis.text = element_text(size = 5),
        axis.title = element_text(size = 7))
glance(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
       training_data = s_curve_noise_training, newdata = s_curve_noise_test,
       type_NLDR = "UMAP", col_start = "x")
#> # A tibble: 1 × 2
#> Error MSE
#> <dbl> <dbl>
#> 1 14.0 0.136
augment(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
        training_data = s_curve_noise_training, newdata = s_curve_noise_test,
        type_NLDR = "UMAP", col_start = "x") |>
  head(5)
#> # A tibble: 5 × 32
#> ID x1 x2 x3 x4 x5 x6 x7 pred_hb_id
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 5 -0.478 1.61 -1.88 0.0101 -0.00746 0.0101 0.00972 8
#> 2 10 -0.727 1.66 0.314 0.00269 0.0196 0.0559 -0.00481 46
#> 3 13 0.513 1.86 1.86 -0.00648 -0.0127 0.00635 -0.00770 70
#> 4 18 0.0635 1.48 -2.00 0.00458 0.0164 -0.0627 -0.00371 15
#> 5 27 0.918 1.36 -1.40 0.0161 0.0160 -0.0190 0.00341 21
#> # ℹ 23 more variables: model_high_d_x1 <dbl>, model_high_d_x2 <dbl>,
#> # model_high_d_x3 <dbl>, model_high_d_x4 <dbl>, model_high_d_x5 <dbl>,
#> # model_high_d_x6 <dbl>, model_high_d_x7 <dbl>, error_square_x1 <dbl>,
#> # error_square_x2 <dbl>, error_square_x3 <dbl>, error_square_x4 <dbl>,
#> # error_square_x5 <dbl>, error_square_x6 <dbl>, error_square_x7 <dbl>,
#> # row_wise_total_error <dbl>, abs_error_x1 <dbl>, abs_error_x2 <dbl>,
#> # abs_error_x3 <dbl>, abs_error_x4 <dbl>, abs_error_x5 <dbl>, …
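To compare the training and test summaries side by side, one option is to bind the two glance() results into a single table; the data column added below is only a label for readability, not part of the package output.
# Sketch: training and test summaries in one table
bind_rows(
  glance(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
         training_data = s_curve_noise_training, newdata = NULL,
         type_NLDR = "UMAP", col_start = "x") |>
    mutate(data = "training"),
  glance(df_bin_centroids = df_bin_centroids, df_bin = df_bin,
         training_data = s_curve_noise_training, newdata = s_curve_noise_test,
         type_NLDR = "UMAP", col_start = "x") |>
    mutate(data = "test")
)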