Skip to contents

Safe correlation function which returns a sparse matrix without missing values

Usage

sparse_cor(
  x,
  y = NULL,
  method = "pearson",
  allow_neg = TRUE,
  remove_na = TRUE,
  remove_inf = TRUE,
  ...
)

Arguments

x

Sparse matrix or character vector.

y

Sparse matrix or character vector. Defaults to NULL, in which case the correlations among the columns of x are computed.

method

Method to use for calculating the correlation coefficient.

allow_neg

Logical. Whether to allow negative values or set them to 0.

remove_na

Logical. Whether to replace NA values with 0.

remove_inf

Logical. Whether to replace infinite values with 1.

...

Other arguments passed to the stats::cor function.

Value

A sparse correlation matrix.

Examples

m1 <- simulate_sparse_matrix(
  500, 100
)
m2 <- simulate_sparse_matrix(
  500, 100,
  seed = 2025
)
a <- sparse_cor(m1)
b <- sparse_cor(m1, m2)
c <- as_matrix(
  cor(as_matrix(m1)),
  sparse = TRUE
)
d <- as_matrix(
  cor(as_matrix(m1), as_matrix(m2)),
  sparse = TRUE
)

a[1:5, 1:5]
#> 5 x 5 sparse Matrix of class "dsCMatrix"
#>             col_1       col_2       col_3       col_4       col_5
#> col_1  1.00000000  0.03982146  0.03290085 -0.02022058  0.00827069
#> col_2  0.03982146  1.00000000 -0.04172518 -0.00276169 -0.03594182
#> col_3  0.03290085 -0.04172518  1.00000000  0.03481704 -0.03144989
#> col_4 -0.02022058 -0.00276169  0.03481704  1.00000000 -0.05034769
#> col_5  0.00827069 -0.03594182 -0.03144989 -0.05034769  1.00000000
c[1:5, 1:5]
#> 5 x 5 sparse Matrix of class "dsCMatrix"
#>             col_1       col_2       col_3       col_4       col_5
#> col_1  1.00000000  0.03982146  0.03290085 -0.02022058  0.00827069
#> col_2  0.03982146  1.00000000 -0.04172518 -0.00276169 -0.03594182
#> col_3  0.03290085 -0.04172518  1.00000000  0.03481704 -0.03144989
#> col_4 -0.02022058 -0.00276169  0.03481704  1.00000000 -0.05034769
#> col_5  0.00827069 -0.03594182 -0.03144989 -0.05034769  1.00000000
all.equal(a, c)
#> [1] "Attributes: < Component “i”: Numeric: lengths (5047, 5050) differ >"  
#> [2] "Attributes: < Component “p”: Mean relative difference: 0.0008422367 >"
#> [3] "Attributes: < Component “x”: Numeric: lengths (5047, 5050) differ >"  

b[1:5, 1:5]
#> 5 x 5 sparse Matrix of class "dgCMatrix"
#>              col_1        col_2       col_3       col_4        col_5
#> col_1  0.003888632 -0.036322389  0.03054830  0.04990672  0.053832055
#> col_2 -0.042598718 -0.044702907  0.05688602 -0.02310432  0.007310609
#> col_3 -0.003132754 -0.041963945 -0.04085159 -0.03183016 -0.028050257
#> col_4  0.045648545  0.009914274  0.01685729  0.03317848 -0.045443563
#> col_5 -0.048082595 -0.050457664  0.04350114 -0.05551273 -0.051113224
d[1:5, 1:5]
#> 5 x 5 sparse Matrix of class "dgCMatrix"
#>              col_1        col_2       col_3       col_4        col_5
#> col_1  0.003888632 -0.036322389  0.03054830  0.04990672  0.053832055
#> col_2 -0.042598718 -0.044702907  0.05688602 -0.02310432  0.007310609
#> col_3 -0.003132754 -0.041963945 -0.04085159 -0.03183016 -0.028050257
#> col_4  0.045648545  0.009914274  0.01685729  0.03317848 -0.045443563
#> col_5 -0.048082595 -0.050457664  0.04350114 -0.05551273 -0.051113224
all.equal(b, d)
#> [1] "Attributes: < Component “i”: Numeric: lengths (9997, 10000) differ >"
#> [2] "Attributes: < Component “p”: Mean relative difference: 0.000311165 >"
#> [3] "Attributes: < Component “x”: Numeric: lengths (9997, 10000) differ >"

m1[sample(1:500, 10)] <- NA
m2[sample(1:500, 10)] <- NA

sparse_cor(m1, m2)[1:5, 1:5]
#> 5 x 5 sparse Matrix of class "dgCMatrix"
#>       col_1        col_2       col_3       col_4        col_5
#> col_1     .  .            .           .           .          
#> col_2     . -0.044702907  0.05688602 -0.02310432  0.007310609
#> col_3     . -0.041963945 -0.04085159 -0.03183016 -0.028050257
#> col_4     .  0.009914274  0.01685729  0.03317848 -0.045443563
#> col_5     . -0.050457664  0.04350114 -0.05551273 -0.051113224

system.time(
  sparse_cor(m1)
)
#>    user  system elapsed 
#>   0.003   0.000   0.003 
system.time(
  cor(as_matrix(m1))
)
#>    user  system elapsed 
#>   0.006   0.000   0.006 

system.time(
  sparse_cor(m1, m2)
)
#>    user  system elapsed 
#>   0.002   0.000   0.001 
system.time(
  cor(as_matrix(m1), as_matrix(m2))
)
#>    user  system elapsed 
#>    0.01    0.00    0.01