Create a Crosstab — crosstab • certetoolbox

Transform a data set into an n x m table, e.g. to be used in certestats::confusion_matrix().

crosstab(
  df,
  identifier,
  compare,
  outcome,
  positive = "^pos.*",
  negative = "^neg.*",
  ...,
  na.rm = TRUE,
  ignore_case = TRUE
)

Arguments

df: a data.frame
identifier: a column name to use as identifier, such as a patient ID or an order ID
compare: a column name whose values form the two axes of the table, i.e. the labels between which the outcomes must be compared
outcome: a column name containing the outcome values to compare
positive: a regex to match the values in outcome that must be considered as the Positive class; use FALSE to not use a Positive class
negative: a regex to match the values in outcome that must be considered as the Negative class; use FALSE to not use a Negative class
...: manual regexes for classes if not using positive and negative, such as Class1 = "c1", Class2 = "c2", Class3 = "c3"
na.rm: a logical to indicate whether empty values must be removed before forming the table
ignore_case: a logical to indicate whether the case in the values of positive, negative and ... must be ignored

Examples

df <- data.frame(
  order_nr = sort(rep(LETTERS[1:20], 2)),
  test_type = rep(c("Culture", "PCR"), 20),
  result = sample(c("pos", "neg"),
                  size = 40,
                  replace = TRUE,
                  prob = c(0.3, 0.9))
)
head(df)
#>   order_nr test_type result
#> 1        A   Culture    neg
#> 2        A       PCR    pos
#> 3        B   Culture    neg
#> 4        B       PCR    neg
#> 5        C   Culture    neg
#> 6        C       PCR    neg

out <- df |> crosstab(order_nr, test_type, result)
out
#>           PCR
#> Culture    Positive Negative
#>   Positive        0        2
#>   Negative        4       14


df$result <- gsub("pos", "#p", df$result)
df$result <- gsub("neg", "#n", df$result)
head(df)
#>   order_nr test_type result
#> 1        A   Culture     #n
#> 2        A       PCR     #p
#> 3        B   Culture     #n
#> 4        B       PCR     #n
#> 5        C   Culture     #n
#> 6        C       PCR     #n
# gives a warning that pattern matching failed:
df |> crosstab(order_nr, test_type, result)
#> Warning: Check the regular expressions in the 'positive' and 'negative' arguments - they are not both matched
#>           PCR
#> Culture    Positive Negative
#>   Positive        0        0
#>   Negative        0        0

# define the patterns yourself in such a case:
df |> crosstab(order_nr, test_type, result,
               positive = "#p",
               negative = "#n")
#>           PCR
#> Culture    Positive Negative
#>   Positive        0        2
#>   Negative        4       14
                             
                             
# defining classes manually, can be more than 2:
df |> crosstab(order_nr, test_type, result,
               ClassA = "#p", Hello = "#n")
#>         PCR
#> Culture  ClassA Hello
#>   ClassA      0     2
#>   Hello       4    14
                             
if ("certestats" %in% rownames(utils::installed.packages())) {
  certestats::confusion_matrix(out)
}
#> 
#> ── Confusion Matrix ────────────────────────────────────────────────────────────
#>           
#>            Positive Negative
#>   Positive        0        2
#>   Negative        4       14
#> 
#> 
#> ── Model Metrics ───────────────────────────────────────────────────────────────
#> 
#> Accuracy                                       0.700
#> Area under the Precision Recall Curve (APRC)   0.125
#> Area under the Receiver Operator Curve (AROC)  0.611
#> Balanced Accuracy                              0.389
#> Brier Score for Classification Models (BSCM)   3.100
#> Costs Function for Poor Classification (CFPC)  1.500
#> F Measure                                      0.000
#> Gain Capture                                   0.222
#> J-Index                                       -0.222
#> Kappa                                         -0.154
#> Matthews Correlation Coefficient (MCC)        -0.167
#> Mean log Loss for Multinomial Data (MLMD)     32.439
#> Negative Predictive Value (NPV)                0.875
#> Positive Predictive Value (PPV)                0.000
#> Precision                                      0.000
#> Prevalence                                     0.200
#> Recall                                         0.000
#> Sensitivity                                    0.000
#> Specificity                                    0.778