MinSNPs_Workflow

library(minSNPs)
library(Biostrings) # optional
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
#>     as.data.frame, basename, cbind, colnames, dirname, do.call,
#>     duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
#>     lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
#>     pmin.int, rank, rbind, rownames, sapply, setdiff, sort, table,
#>     tapply, union, unique, unsplit, which, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#> 
#>     expand.grid
#> Loading required package: IRanges
#> Loading required package: XVector
#> 
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#> 
#>     strsplit
library(BiocParallel) # optional, but needed for parallel processing

Reading & processing input for further analysis

read_fasta is provided as a way to read fasta file, equivalent function, e.g., from Biostrings and read.fasta from seqinr can be used.

isolates_from_biostring <- readBStringSet(
  system.file("extdata", "Chlamydia_mapped.fasta", package = "minSNPs"))
isolates_from_default <- read_fasta(
  system.file("extdata", "Chlamydia_mapped.fasta", package = "minSNPs"))
processed_from_biostring <- process_allele(isolates_from_biostring)
#> Ignored samples: 
#>  
#> Ignored  0  positions
processed_from_default <- process_allele(isolates_from_default)
#> Ignored samples: 
#>  
#> Ignored  0  positions

Subsequent analyses can use output from process_allele.

Identifying SNPs with high Simpson’s index

high_d_snps <- find_optimised_snps(seqc = processed_from_biostring,
  metric = "simpson", number_of_result = 1, max_depth = 1,
  included_positions = c(), excluded_positions = c())

Identifying SNPs discriminating a group of interest

discriminating_snps <- find_optimised_snps(seqc = processed_from_biostring,
  metric = "percent", number_of_result = 1, max_depth = 1,
  included_positions = c(), excluded_positions = c(),
  goi = c("A_D213", "H_S1432"))

Displaying/saving result

cat("High D SNPs\n")
#> High D SNPs
output_result(high_d_snps)
#> Result - 1
#> Position(s)  Score
#> 1988 0.734415584415584
#> 
#> Groups
#> T    A_D213, H_S1432, Ia_SotoGIa3, Ia_SotoGIa1, C_UW1, A_7249, A_HAR-13, C_Aus10, H_R31975, A_2497, L3_404, K_SotoGK1, A_363, J_6276, A_5291, C_TW3
#> G    B_Aus3, L1_440, Ba_Aus25, B_TZ1A828, Ba_Apache2, D_UW-3, Ds_2923, L1_SA16, B_Har36, D_SotoGD6, D_SotoGD5, B_Jali20, D_SotoGD1
#> A    F_SW5, L2b_CV204, L2_LST, L2b_UCH-1, L2b_795, L1_224, G_11074, F_70, L2b_C1, G_SotoGG1, G_11222, L2b_8200, F_SW4, G_9301, L2b_C2, G_9768, L2_434, F_SotoGF3, L1_115, L2b_UCH-2
#> C    E_150, E_Bour, E_SotoGE4, E_SW2, E_SW3, E_SotoGE8, E_11023
#> 
#> 
#> Additional details
#> Metric:   simpson
#> Excluded Positions:   
#> Excluded Positions From process_allele:   
#> Included Positions:   
#> Group of interest:    
#> All analysed sequences:   A_D213, H_S1432, Ia_SotoGIa3, B_Aus3, Ia_SotoGIa1, C_UW1, F_SW5, L2b_CV204, L1_440, A_7249, Ba_Aus25, L2_LST, B_TZ1A828, A_HAR-13, E_150, E_Bour, C_Aus10, H_R31975, E_SotoGE4, L2b_UCH-1, L2b_795, A_2497, Ba_Apache2, E_SW2, L3_404, L1_224, D_UW-3, G_11074, Ds_2923, F_70, K_SotoGK1, E_SW3, L2b_C1, E_SotoGE8, G_SotoGG1, G_11222, A_363, L1_SA16, L2b_8200, J_6276, F_SW4, G_9301, L2b_C2, A_5291, G_9768, L2_434, F_SotoGF3, C_TW3, E_11023, L1_115, B_Har36, L2b_UCH-2, D_SotoGD6, D_SotoGD5, B_Jali20, D_SotoGD1
cat("SNPs discriminating against A_D213, H_S1432\n")
#> SNPs discriminating against A_D213, H_S1432
output_result(discriminating_snps)
#> Result - 1
#> Position(s)  Score
#> 1806 0.944444444444444
#> 
#> Groups
#> *target* - G A_D213, H_S1432, C_UW1, C_Aus10, C_TW3
#> C    Ia_SotoGIa3, Ia_SotoGIa1, A_7249, A_HAR-13, A_2497, D_UW-3, Ds_2923, K_SotoGK1, A_363, J_6276, A_5291, D_SotoGD6, D_SotoGD5, D_SotoGD1
#> A    B_Aus3, F_SW5, L2b_CV204, L1_440, Ba_Aus25, L2_LST, B_TZ1A828, E_150, E_Bour, H_R31975, E_SotoGE4, L2b_UCH-1, L2b_795, Ba_Apache2, E_SW2, L3_404, L1_224, G_11074, F_70, E_SW3, L2b_C1, E_SotoGE8, G_SotoGG1, G_11222, L1_SA16, L2b_8200, F_SW4, G_9301, L2b_C2, G_9768, L2_434, F_SotoGF3, E_11023, L1_115, B_Har36, L2b_UCH-2, B_Jali20
#> 
#> 
#> Additional details
#> Metric:   percent
#> Excluded Positions:   
#> Excluded Positions From process_allele:   
#> Included Positions:   
#> Group of interest:    A_D213, H_S1432
#> All analysed sequences:   A_D213, H_S1432, Ia_SotoGIa3, B_Aus3, Ia_SotoGIa1, C_UW1, F_SW5, L2b_CV204, L1_440, A_7249, Ba_Aus25, L2_LST, B_TZ1A828, A_HAR-13, E_150, E_Bour, C_Aus10, H_R31975, E_SotoGE4, L2b_UCH-1, L2b_795, A_2497, Ba_Apache2, E_SW2, L3_404, L1_224, D_UW-3, G_11074, Ds_2923, F_70, K_SotoGK1, E_SW3, L2b_C1, E_SotoGE8, G_SotoGG1, G_11222, A_363, L1_SA16, L2b_8200, J_6276, F_SW4, G_9301, L2b_C2, A_5291, G_9768, L2_434, F_SotoGF3, C_TW3, E_11023, L1_115, B_Har36, L2b_UCH-2, D_SotoGD6, D_SotoGD5, B_Jali20, D_SotoGD1
output_result(high_d_snps, view = "csv",
  file_name = "high_d_snps.csv")
output_result(discriminating_snps, view = "csv",
  file_name = "discriminating_snps.csv")