Transform a data set into an n x m table, e.g. to be used in certestats::confusion_matrix().

crosstab(
  df,
  identifier,
  compare,
  outcome,
  positive = "^pos.*",
  negative = "^neg.*",
  ...,
  na.rm = TRUE,
  ignore_case = TRUE
)

Arguments

df

a data.frame

identifier

a column name to use as identifier, such as a patient ID or an order ID

compare

a column name whose values form the two axes of the table, i.e. the labels between which the outcomes must be compared (such as the test type)

outcome

a column name containing the outcome values to compare

positive

a stringr::regex to match the values in outcome that must be considered as the Positive class; use FALSE to not use a Positive class

negative

a stringr::regex to match the values in outcome that must be considered as the Negative class; use FALSE to not use a Negative class

...

manual stringr::regexes for classes if not using positive and negative, such as Class1 = "c1", Class2 = "c2", Class3 = "c3"

na.rm

a logical to indicate whether empty values must be removed before forming the table

ignore_case

a logical to indicate whether letter case must be ignored when matching the regular expressions given in positive, negative and ...

Examples

df <- data.frame(
  order_nr = sort(rep(LETTERS[1:20], 2)),
  test_type = rep(c("Culture", "PCR"), 20),
  result = sample(c("pos", "neg"),
                  size = 40,
                  replace = TRUE,
                  prob = c(0.3, 0.9))
)
head(df)
#>   order_nr test_type result
#> 1        A   Culture    neg
#> 2        A       PCR    pos
#> 3        B   Culture    neg
#> 4        B       PCR    neg
#> 5        C   Culture    neg
#> 6        C       PCR    neg

out <- df |> crosstab(order_nr, test_type, result)
out
#>           PCR
#> Culture    Positive Negative
#>   Positive        0        2
#>   Negative        4       14


df$result <- gsub("pos", "#p", df$result)
df$result <- gsub("neg", "#n", df$result)
head(df)
#>   order_nr test_type result
#> 1        A   Culture     #n
#> 2        A       PCR     #p
#> 3        B   Culture     #n
#> 4        B       PCR     #n
#> 5        C   Culture     #n
#> 6        C       PCR     #n
# gives a warning that pattern matching failed:
df |> crosstab(order_nr, test_type, result)
#> Warning: Check the regular expressions in the 'positive' and 'negative' arguments - they are not both matched
#>           PCR
#> Culture    Positive Negative
#>   Positive        0        0
#>   Negative        0        0

# define the pattern yourself in such case:
df |> crosstab(order_nr, test_type, result,
               positive = "#p",
               negative = "#n")
#>           PCR
#> Culture    Positive Negative
#>   Positive        0        2
#>   Negative        4       14
                             
                             
# defining classes manually, can be more than 2:
df |> crosstab(order_nr, test_type, result,
               ClassA = "#p", Hello = "#n")
#>         PCR
#> Culture  ClassA Hello
#>   ClassA      0     2
#>   Hello       4    14
                             
if ("certestats" %in% rownames(utils::installed.packages())) {
  certestats::confusion_matrix(out)
}
#> 
#> ── Confusion Matrix ────────────────────────────────────────────────────────────
#> 
#>           Predicted
#> Actual     Positive Negative
#>   Positive        0        2
#>   Negative        4       14
#> 
#> ── Model Metrics ───────────────────────────────────────────────────────────────
#>                                               
#>  Accuracy                                0.700
#>  Balanced Accuracy                       0.438
#>  F1 Score                                0.000
#>  J-Index                                -0.125
#>  Kappa                                  -0.154
#>  Matthews Correlation Coefficient (MCC) -0.167
#>  Negative Predictive Value (NPV)         0.778
#>  Positive Predictive Value (PPV)         0.000
#>  Precision                               0.000
#>  Prevalence                              0.100
#>  Recall                                  0.000
#>  Sensitivity                             0.000
#>  Specificity                             0.875
#> 
#> ── Model Interpretation ────────────────────────────────────────────────────────
#> 
#> Overall performance is unacceptable. Accuracy (70.0%) and balanced accuracy
#> (43.8%) indicate limited separation between classes. Agreement between
#> predicted and true classes is strong (Cohen's Kappa = -15.4%, MCC = -16.7%).
#> These account for chance agreement and are robust to class imbalance. Precision
#> and recall (both 0.0%) are perfectly aligned, indicating an ideally balanced
#> trade-off between false positives and missed true cases. The macro-averaged F1
#> score is 0.0%, indicating balanced harmonic performance across classes. The
#> model's ability to rule out incorrect classes is strong, with specificity at
#> 87.5% and negative predictive value at 77.8%. Most misclassifications are
#> concentrated between a small number of class pairs, indicating overlap between
#> specific categories rather than random error. Class imbalance is present
#> (max:minor support ratio = 9.00). While macro-averaging mitigates this, some
#> metrics may still overestimate performance on minority classes.