Skip to contents

To obtain the model summaries, first you need to obtain the 2D hexagonal bin centroids which is the 2D model (df_bin_centroids) and the high-dimensional coordinates of bin centroids which represents the lifted model in high-dimensions (df_bin).

r2 <- diff(range(s_curve_noise_umap$UMAP2))/diff(range(s_curve_noise_umap$UMAP1))
model <- fit_highd_model(training_data = s_curve_noise_training, 
                         emb_df = s_curve_noise_umap_scaled,
                         bin1 = 6, r2 = r2,
                         col_start_highd = "x")
df_bin_centroids <- model$df_bin_centroids
df_bin <- model$df_bin

Let’s first compute the model summaries for training data. To do that, you need to predict 2D embedding for the training data.

pred_df_training <- predict_emb(test_data = s_curve_noise_training, 
                             df_bin_centroids = df_bin_centroids, 
                             df_bin = df_bin, type_NLDR = "UMAP")
glimpse(pred_df_training)
#> Rows: 3,750
#> Columns: 4
#> $ pred_UMAP_1 <dbl> 0.226810215, 0.989367382, 0.771493906, 0.117873476, 0.4446…
#> $ pred_UMAP_2 <dbl> 0.84959221, 0.28354031, 0.28354031, 0.66090824, 0.84959221…
#> $ ID          <int> 1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 15, 16, 18, 19, 24, 25, 2…
#> $ pred_hb_id  <int> 32, 18, 17, 26, 33, 7, 35, 17, 18, 14, 32, 2, 39, 29, 38, …
s_curve_noise_umap_scaled |>
    ggplot(aes(x = UMAP1,
               y = UMAP2,
               label = ID))+
    geom_point(alpha=0.5) +
    geom_point(data = pred_df_training, aes(x = pred_UMAP_1, y = pred_UMAP_2), 
               color = "red", alpha=0.5) +
    coord_equal() +
    theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
          axis.text = element_text(size = 5),
          axis.title = element_text(size = 7))

Next, you can use the generated 2D embedding to compute model summaries.

glance(df_bin_centroids = df_bin_centroids, df_bin = df_bin, 
        training_data = s_curve_noise_training, newdata = NULL, 
        type_NLDR = "UMAP", col_start = "x")
#> # A tibble: 1 × 2
#>   Error   MSE
#>   <dbl> <dbl>
#> 1 2226. 0.153

To obtain a dataframe with the high-dimensional data, the fitted model data, the predicted NLDR data, and summary information, you can use the following function.

augment(df_bin_centroids = df_bin_centroids, df_bin = df_bin, 
        training_data = s_curve_noise_training, newdata = NULL, 
        type_NLDR = "UMAP", col_start = "x") |>
  head(5)
#> # A tibble: 5 × 32
#>      ID      x1     x2       x3       x4       x5      x6        x7 pred_hb_id
#>   <int>   <dbl>  <dbl>    <dbl>    <dbl>    <dbl>   <dbl>     <dbl>      <int>
#> 1     1 -0.120  1.64   -1.99     0.0104   0.0125   0.0923 -0.00128          32
#> 2     2 -0.0492 1.51    0.00121 -0.0177   0.00726 -0.0362 -0.00535          18
#> 3     3 -0.774  1.30    0.367   -0.00173  0.0156  -0.0962  0.00335          17
#> 4     5 -0.478  0.0177 -1.88     0.00848  0.00533  0.0998  0.000677         26
#> 5     6  0.818  0.927  -1.58    -0.00318 -0.00980  0.0989  0.00696          33
#> # ℹ 23 more variables: model_high_d_x1 <dbl>, model_high_d_x2 <dbl>,
#> #   model_high_d_x3 <dbl>, model_high_d_x4 <dbl>, model_high_d_x5 <dbl>,
#> #   model_high_d_x6 <dbl>, model_high_d_x7 <dbl>, error_square_x1 <dbl>,
#> #   error_square_x2 <dbl>, error_square_x3 <dbl>, error_square_x4 <dbl>,
#> #   error_square_x5 <dbl>, error_square_x6 <dbl>, error_square_x7 <dbl>,
#> #   row_wise_total_error <dbl>, abs_error_x1 <dbl>, abs_error_x2 <dbl>,
#> #   abs_error_x3 <dbl>, abs_error_x4 <dbl>, abs_error_x5 <dbl>, …

The same workflow is followed for the test data as well.

pred_df_test <- predict_emb(test_data = s_curve_noise_test, 
                             df_bin_centroids = df_bin_centroids, 
                             df_bin = df_bin, type_NLDR = "UMAP")
s_curve_noise_umap_scaled |>
    ggplot(aes(x = UMAP1,
               y = UMAP2,
               label = ID))+
    geom_point(alpha=0.5) +
    geom_point(data = pred_df_test, aes(x = pred_UMAP_1, y = pred_UMAP_2), 
               color = "red", alpha=0.5) +
    coord_equal() +
    theme(plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
          axis.text = element_text(size = 5),
          axis.title = element_text(size = 7))

glance(df_bin_centroids = df_bin_centroids, df_bin = df_bin, 
        training_data = s_curve_noise_training, newdata = s_curve_noise_test, 
        type_NLDR = "UMAP", col_start = "x")
#> # A tibble: 1 × 2
#>   Error   MSE
#>   <dbl> <dbl>
#> 1  758. 0.156
augment(df_bin_centroids = df_bin_centroids, df_bin = df_bin, 
        training_data = s_curve_noise_training, newdata = s_curve_noise_test, 
        type_NLDR = "UMAP", col_start = "x") |>
  head(5)
#> # A tibble: 5 × 32
#>      ID      x1    x2       x3       x4       x5      x6        x7 pred_hb_id
#>   <int>   <dbl> <dbl>    <dbl>    <dbl>    <dbl>   <dbl>     <dbl>      <int>
#> 1     4 -0.606  0.246 -1.80    -0.00897 -0.0187  -0.0716  0.00126          26
#> 2     8 -0.0691 1.59   0.00239  0.0127  -0.0130   0.0396 -0.000185         18
#> 3    13  0.513  1.02   1.86     0.0141  -0.0149   0.0619 -0.00309           8
#> 4    14  0.869  0.576 -0.505   -0.0196   0.00169 -0.0197  0.00597          29
#> 5    17 -0.737  1.94  -1.68     0.00601 -0.0113   0.0301 -0.00988          38
#> # ℹ 23 more variables: model_high_d_x1 <dbl>, model_high_d_x2 <dbl>,
#> #   model_high_d_x3 <dbl>, model_high_d_x4 <dbl>, model_high_d_x5 <dbl>,
#> #   model_high_d_x6 <dbl>, model_high_d_x7 <dbl>, error_square_x1 <dbl>,
#> #   error_square_x2 <dbl>, error_square_x3 <dbl>, error_square_x4 <dbl>,
#> #   error_square_x5 <dbl>, error_square_x6 <dbl>, error_square_x7 <dbl>,
#> #   row_wise_total_error <dbl>, abs_error_x1 <dbl>, abs_error_x2 <dbl>,
#> #   abs_error_x3 <dbl>, abs_error_x4 <dbl>, abs_error_x5 <dbl>, …