SPAAM-community
diff --git a/‎assets/images/chapters/authentication/BioxRiv_paper.png‎
-336 KB b/‎assets/images/chapters/authentication/BioxRiv_paper.png‎
-336 KB
diff --git a/‎assets/images/chapters/authentication/p1.png‎
94.1 KB b/‎assets/images/chapters/authentication/p1.png‎
94.1 KB
diff --git a/‎assets/images/chapters/authentication/p2.png‎
167 KB b/‎assets/images/chapters/authentication/p2.png‎
167 KB
diff --git a/‎assets/images/chapters/authentication/p3.png‎
-131 KB b/‎assets/images/chapters/authentication/p3.png‎
-131 KB
diff --git a/‎assets/images/chapters/authentication/paper.png‎
1.91 MB b/‎assets/images/chapters/authentication/paper.png‎
1.91 MB
diff --git a/‎assets/references/authentication.bib‎
Lines changed: 11 additions & 11 deletions b/‎assets/references/authentication.bib‎
Lines changed: 11 additions & 11 deletions
diff --git a/‎authentication.qmd‎
Lines changed: 108 additions & 119 deletions b/‎authentication.qmd‎
Lines changed: 108 additions & 119 deletions
@@ -32,17 +32,17 @@ @unpublished{Pochon2022-hj
   language = {en}
 }
 
-@article {Zampirolo2023.12.01.569562,
-	author = {Zampirolo, Giulia and Holman, Luke E. and Sawafuji, Rikai and Pt{\'a}kov{\'a}, Michaela and Kova{\v c}ikov{\'a}, Lenka and {\v S}{\'\i}da, Petr and Pokorn{\'y}, Petr and Pedersen, Mikkel Winther and Walls, Matthew},
-	title = {Early Pastoralism in Central European Forests: Insights from Ancient Environmental Genomics},
-	elocation-id = {2023.12.01.569562},
-	year = {2023},
-	doi = {10.1101/2023.12.01.569562},
-	publisher = {Cold Spring Harbor Laboratory},
-	abstract = {Central European forests have been shaped by complex human interactions throughout the Holocene, with significant changes following the introduction of domesticated animals in the Neolithic (\~{}7.5 {\textendash} 6.0 kyr BP). However, understanding early pastoral practices and their impact on forests is limited by methods for detecting animal movement across past landscapes. Here we examine ancient sedimentary DNA (sedaDNA) preserved at the Velk{\'y} Mamu{\v t}{\'a}k rock shelter, in northern Bohemia (Czech Republic), which has been a forested enclave since the early Holocene. We find that domesticated animals, their associated microbiomes, and plants potentially gathered for fodder, have clear representation by the Late Neolithic, around 6.0 kyr BP, and persist throughout the Bronze Age into recent times. We identify a change in dominant grazing species from sheep to pigs in the Bronze Age (\~{}4.1 {\textendash} 3.0 kyr BP) and interpret the impact this had in the mid-Holocene retrogressions that still define the structure of Central European forests today. This study highlights the ability of ancient metagenomics to bridge archaeological and paleoecological methods and provide an enhanced perspective on the roots of the Anthropocene.Competing Interest StatementThe authors have declared no competing interest.},
-	URL = {https://www.biorxiv.org/content/early/2023/12/03/2023.12.01.569562},
-	eprint = {https://www.biorxiv.org/content/early/2023/12/03/2023.12.01.569562.full.pdf},
-	journal = {bioRxiv}
+@article{zampirolo2024tracing,
+  title={Tracing early pastoralism in Central Europe using sedimentary ancient DNA},
+  author={Zampirolo, Giulia and Holman, Luke E and Sawafuji, Rikai and Pt{\'a}kov{\'a}, Michaela and Kova{\v{c}}ikov{\'a}, Lenka and {\v{S}}{\'\i}da, Petr and Pokorn{\'y}, Petr and Pedersen, Mikkel Winther and Walls, Matthew},
+  journal={Current Biology},
+  volume={34},
+  number={20},
+  pages={4650--4661},
+  year={2024},
+  publisher={Elsevier},
+  doi={10.1016/j.cub.2024.08.047},
+  url={https://doi.org/10.1016/j.cub.2024.08.047}
 }
 
 @article{Pedersen2015,
 
@@ -475,9 +475,9 @@ For custom reference genomes not covered by NCBI, their accession IDs and the co
 The `ngsLCA` program considers a chosen similarity interval between each read and its reference in the generated bam/sam file.
 The similarity can be set as an edit distance `[-editdist[min/max]]`, i.e., number of mismatches between the read to reference genome, or as a similarity distance `[-simscore[low/high]]`, i.e., percentage of mismatches between the read to reference genome. 
 
-The main files produced by this command have the extensions `.bdamage.gz` and `lca.gz`.
+The main files produced by this command have the extensions `.bdamage.gz`, `.lca.gz` and `.stat.gz`.
 The first consists of a nucleotide misincorporation matrix (also called **mismatch matrix**) which represents the nucleotide substitution counts across the reads (@tbl-authentication-examplecodetable2).
-The lca file reports the sequence analysed and its taxonomic path, as well as other statistics (gc content, fragment length).
+The lca file reports the sequences analysed and their taxonomic paths, while the stat file includes other statistics (GC content, fragment length).
 
 We report an example of the bdamage.gz file output printed using the command `metaDMG-cpp print`:
 
@@ -564,16 +564,16 @@ While Pitch-6 and Cave-22 samples, which are 6 and 22 thousand years old and thu
 
 ### Ancient metagenomic dataset
 
-In this section, we will use 6 metagenomic libraries downsampled with eukaryotes reads from the study by [@Zampirolo2023.12.01.569562] (@fig-authentication-fig6).
+In this section, we will use 6 metagenomic libraries downsampled with eukaryotes reads from the study by [@zampirolo2024tracing] (@fig-authentication-fig6).
 The libraries originate from sediment samples of the Velký Mamut'ák rock shelter located in Northern Bohemia (Czech Republic) and covering the period between the Late Neolithic (~6100-5300 cal. BP) to more recent times (800 cal BP). 
 
-![Screenshot of preprint of the source dataset by [@Zampirolo2023.12.01.569562]](assets/images/chapters/authentication/BioxRiv_paper.png){#fig-authentication-fig6}
+![Screenshot of the study of the source dataset by [@zampirolo2024tracing]](assets/images/chapters/authentication/paper.png){#fig-authentication-fig6}
 
 ### Ancient metagenomics with metaDMG-cpp: the workflow
 
 This section will cover the metaDMG analysis which involve taxonomic classification of the reads starting from sorted SAM files, the damage estimation and compilation of the final metaDMG output.
 
-To begin, we can find raw SAM files used as input to `metaDMG` we will use for the exercise are stored in `metadmg`.
+To begin, the raw SAM files that we will use as input to `metaDMG` for this exercise are stored in the `metadmg` folder.
 
 We also need the taxonomy files, which are in the folder `metadmg/small_taxonomy/`, these include `names.dmp`, `nodes.dmp` and `small_accession2taxid.txt.gz`. 
 
@@ -582,16 +582,10 @@ We also need the taxonomy files, which are in the folder `metadmg/small_taxonomy
 The best documentation is currently found in the –help function.
 :::
 
-We need to activate a dedicated environment for `metaDMG` as it is still under development. We candeactivate the current one with
+`metaDMG` is installed in the conda environment `authentication`. If this environment is not activated yet, we activate it by running:
 
 ```bash
-conda deactivate 
-```
-
-And we will work with metaDMG by activating the environment with the following command.
-
-```bash
-conda activate metaDMG
+conda activate authentication 
 ```
 
 :::{.callout-warning}
@@ -701,16 +695,9 @@ Open the TSV file `concatenated_metaDMGfinal.tsv` in a spreadsheet manner and in
 
 We will now investigate the TSV table produced by metaDMG to authenticate damage patterns, visualise the relationship between the damage and the significance, and the degree of damage through depth and time.
 
-R packages for this exercise are located in our original conda environment `authentication`.
+R packages for this exercise are located in the same conda environment `authentication`.
 
-While still in the `authentication/metadmg/` folder, We deactivate the current conda environment and we re-activate the environment `authentication`.
-
-```bash
-conda deactivate
-conda activate authentication
-```
-
-We load R by running `R` in your terminal
+While still in the `authentication/metadmg/` folder, we start R by running `R` in the terminal:
 
 ```bash
 R
@@ -729,6 +716,96 @@ library(purrr)
 library(ggpubr)
 ``` 
 
+### Amplitude of damage vs Significance
+
+We provide an R script to investigate the main statistics. 
+
+Here we visualise the amplitude of damage (A) against its significance (Zfit) for the full dataset, keeping only animal (Metazoa) and plant (Viridiplantae) taxa at the genus level with more than 100 reads (@fig-authentication-fig7). 
+
+```{r eval=F}
+#We load our metaDMG output data (TSV file) and the metadata with information on the age of each sample.
+df <- read.csv("concatenated_metaDMGfinal.tsv",  sep = "\t")
+
+#Rename sample column
+colnames(df)[colnames(df) == 'filename'] <- 'sample'
+
+#Modify sample name with short names
+df$sample[df$sample == "VM-11_aggregated_results.stat"] <- "VM-11"
+df$sample[df$sample == "VM-14_aggregated_results.stat"] <- "VM-14"
+df$sample[df$sample == "VM-15_aggregated_results.stat"] <- "VM-15"
+df$sample[df$sample == "VM-17_aggregated_results.stat"] <- "VM-17"
+df$sample[df$sample == "VM-19_aggregated_results.stat"] <- "VM-19"
+df$sample[df$sample == "VM-3_aggregated_results.stat"] <- "VM-3"
+
+#Import the metadata with dates BP
+depth_data <- read.csv ("figures/depth_data.csv", header = TRUE)
+View (depth_data)
+
+#Merge context_data and depth_data with dataframe (adding new column for dates BP)
+df$new <- depth_data$Date_BP[match(df$sample, depth_data$Sample_ID)]
+names(df)[names(df) == 'new'] <- 'Date_BP'
+
+# Convert Date_BP columns to factors (categorical variable) 
+df$Date_BP <- as.factor(df$Date_BP)
+
+#Subset dataset animal and plants at the genus level
+dt1 <- df %>% filter(nreads > 100, grepl("\\bgenus\\b", rank), grepl("Metazoa", taxa_path) | grepl("Viridiplant", taxa_path))
+
+#Adding factor column for Kingdom
+dt1 <- dt1 %>% 
+  mutate(Kingdom =   # creating our new column
+           case_when(grepl("Viridiplant", taxa_path) ~ "Viridiplantae",
+                     grepl("Metazoa",taxa_path) ~ "Metazoa"))
+
+#Plotting  amplitude of damage vs its significance and saving as pdf file
+pdf(file = "figures/p1.pdf", width = 8, height = 6)
+ggplot(dt1, aes(y=A, x=Zfit)) + 
+  geom_point(aes(size=nreads, col=Kingdom)) +
+  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust =1)) +
+  scale_color_manual(values = c("#8B1A1A", "#458B00"))+
+  scale_size_continuous(labels = function(x) format(x, scientific = FALSE)) +
+  xlab("significance") + ylab("damage") + theme_minimal()
+dev.off()
+``` 
+
+![Amplitude of damage (A) vs significance (Zfit) for animals and plants.](assets/images/chapters/authentication/p1.png){#fig-authentication-fig7}
+
+### Amplitude of damage and mean fragment length through time
+
+Here we visualise the amplitude of damage (A) and the mean length of the fragments (mean_rlen) by date (BP) for the filtered dataset with a minimum of 100 reads and at the genus level (@fig-authentication-fig8).
+
+```{r eval=F}
+#Plotting damage (A) by period (dates BP)
+p2a<- dt1 %>%
+  mutate(Date_BP = fct_relevel(Date_BP,
+                             "6100","5300","4100","3900","3000", "800")) %>%
+  ggplot(aes(x=A, y=Date_BP))+ 
+  geom_boxplot(aes(x=A, y=Date_BP, fill = sample))+
+  geom_point(aes(fill = sample), size = 3, shape = 21, color = "black", stroke = .5) +
+  scale_x_continuous(limits = c(0, 0.20), breaks = seq(0, 0.20, by = 0.05)) +
+  theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+p2a
+
+#Plotting mean length (mean_rlen) by period (dates BP)
+p2b<- dt1 %>%
+  mutate(Date_BP = fct_relevel(Date_BP,
+                             "6100","5300","4100","3900","3000", "800")) %>%
+  ggplot(aes(x=mean_rlen, y=Date_BP))+ 
+  geom_boxplot(aes(x=mean_rlen, y=Date_BP, fill = sample)) +
+  geom_point(aes(fill = sample), size = 3, shape = 21, color = "black", stroke = .5) +
+  scale_x_continuous(limits = c(30, 80), breaks = seq(30, 80, by = 10)) +
+  theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
+p2b
+
+#Combining the plots and saving as pdf file
+pdf(file = "figures/p2.pdf", width = 8, height = 6)
+p2 <- grid.arrange(p2a, p2b,
+                   ncol = 2, nrow = 1)
+dev.off()
+``` 
+
+![Amplitude of damage (A) and mean fragment length (mean_rlen) through time.](assets/images/chapters/authentication/p2.png){#fig-authentication-fig8}
+
 #### Deamination patterns
 
 We run the damage plot to visualise the deamination patterns along forward and reverse strands, and we save the results per each taxon detected in the samples.
@@ -847,65 +924,41 @@ get_dmg_decay_fit <- function(df, orient = "fwd", pos = 30, p_breaks = c(0, 0.1,
 }
 ```
 
-We load our metaDMG output data (TSV file) and the metadata with information on the age of each sample. We generate the damage plots as seen in @fig-authentication-fagusovisdmg using the function `get-damage`.
+We generate the damage plots as seen in @fig-authentication-fagusovisdmg using the function `get_dmg_decay_fit` defined above.
 
 ```{r eval=F}
-df <- read.csv("concatenated_metaDMGfinal.tsv",  sep = "\t")
-
-#Rename sample column
-colnames(df)[colnames(df) == 'filename'] <- 'sample'
-
-#Modify sample name with short names
-df$sample[df$sample == "VM-11_aggregated_results.stat"] <- "VM-11"
-df$sample[df$sample == "VM-14_aggregated_results.stat"] <- "VM-14"
-df$sample[df$sample == "VM-15_aggregated_results.stat"] <- "VM-15"
-df$sample[df$sample == "VM-17_aggregated_results.stat"] <- "VM-17"
-df$sample[df$sample == "VM-19_aggregated_results.stat"] <- "VM-19"
-df$sample[df$sample == "VM-3_aggregated_results.stat"] <- "VM-3"
-
-#Import the metadata with dates BP
-depth_data <- read.csv ("figures/depth_data.csv", header = TRUE)
-View (depth_data)
-
-#Merge context_data and depth_data with dataframe (adding new column for dates BP)
-df$new <- depth_data$Date_BP[match(df$sample, depth_data$Sample_ID)]
-names(df)[names(df) == 'new'] <- 'Date_BP'
-
-# Convert Date_BP columns to factors (categorical variable) 
-df$Date_BP <- as.factor(df$Date_BP)
-
 #Setting filtering theshold for ancient reads
 minDMG = 0.02 # filter criteria, plot only taxa above set value
 zfit = 2 # minimum significance, the higher the better, 2 would mean that we estimante the damage with 95% confidence. 
 MinLength = 35 # minimum mean readlength, while we set a hard filter initially while trimming, we would like the mean readlength to be 35 or higher. 
 reads = 200 # number of reads required depends on the amount of damage and the significance
 
 #Subsetting only animals and plants, at the genus level, number of reads > 200.
-dt1 <- df %>% filter(A > minDMG, nreads >= reads, mean_rlen >= MinLength, Zfit  > zfit, grepl("\\bgenus\\b", rank), !grepl("Bacteria",taxa_path))
+dt2 <- df %>% filter(A > minDMG, nreads >= reads, mean_rlen >= MinLength, Zfit  > zfit, grepl("\\bgenus\\b", rank), !grepl("Bacteria",taxa_path))
 
 #deamination plot with facet wrap per each taxon in a sample
-tax_g_list <- unique(dt1$name)
+tax_g_list <- unique(dt2$name)
 nrank <- "rank" # Replace with the actual rank column name
 
 X <- tax_g_list 
 purrr::map(tax_g_list, function(X, nrank) {
-  sel_tax <- dt1 %>%
+  sel_tax <- dt2 %>%
     rename(label = sample) %>%
     filter(name == X) %>%
     filter(rank == rank) %>%
     select(name, label) %>%
     distinct() %>%
     arrange(name)
   if (nrow(sel_tax) > 0) {
-    n_readsa <- dt1 %>%
+    n_readsa <- dt2 %>%
       inner_join(sel_tax) %>%
       filter(rank == rank) %>%
       pull(nreads) %>%
       sum()
     ggpubr::ggarrange(plotlist = list(
-      get_dmg_decay_fit(df = dt1 %>% rename(label = sample) %>% inner_join(sel_tax) %>% filter(rank == rank), orient = "fwd", y_max = 0.70) +
+      get_dmg_decay_fit(df = dt2 %>% rename(label = sample) %>% inner_join(sel_tax) %>% filter(rank == rank), orient = "fwd", y_max = 0.70) +
         ggtitle(paste0(X, " nreads=", n_readsa, " Forward")),
-      get_dmg_decay_fit(df = dt1 %>% rename(label = sample)  %>% inner_join(sel_tax) %>% filter(rank == rank), orient = "rev", y_max = 0.70) +
+      get_dmg_decay_fit(df = dt2 %>% rename(label = sample)  %>% inner_join(sel_tax) %>% filter(rank == rank), orient = "rev", y_max = 0.70) +
         ggtitle(paste0(X, " nreads=", n_readsa, " Reverse"))
     ), align = "hv")
     ggsave(paste0("figures/", X, "-dmg.pdf"), plot = last_plot(), width = 8, height = 4)
@@ -914,71 +967,6 @@ purrr::map(tax_g_list, function(X, nrank) {
 ``` 
 ![Deamination patterns for sheep (*Ovis*) and beech (*Fagus*) reads.](assets/images/chapters/authentication/Fagus_Ovis-dmg.png){#fig-authentication-fagusovisdmg}
 
-### Amplitude of damage vs Significance
-
-We provide an R script to investigate the main statistics. 
-
-Here we visualise the amplitude of damage (A) and its significance (Zfit), for the full dataset but filtering it to a minimum of 100 reads and at the genus level (@fig-authentication-fig8). 
-
-```{r eval=F}
-#Subset dataset animal and plants at the genus level
-dt2 <- df %>% filter(nreads > 100, grepl("\\bgenus\\b", rank), grepl("Metazoa", taxa_path) | grepl("Viridiplant", taxa_path))
-
-#Adding factor column for Kingdom
-dt2 <- dt2 %>% 
-  mutate(Kingdom =   # creating our new column
-           case_when(grepl("Viridiplant", taxa_path) ~ "Viridiplantae",
-                     grepl("Metazoa",taxa_path) ~ "Metazoa"))
-
-#Plotting  amplitude of damage vs its significance and saving as pdf file
-pdf(file = "figures/p1.pdf", width = 8, height = 6)
-ggplot(dt2, aes(y=A, x=Zfit)) + 
-  geom_point(aes(size=nreads, col=Kingdom)) +
-  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust =1)) +
-  scale_color_manual(values = c("#8B1A1A", "#458B00"))+
-  scale_size_continuous(labels = function(x) format(x, scientific = FALSE)) +
-  xlab("significance") + ylab("damage") + theme_minimal()
-dev.off()
-``` 
-
-![Amplitude of damage (A) vs significance (Zfit) for animals and plants.](assets/images/chapters/authentication/p2.png){#fig-authentication-fig8}
-
-### Amplitude of damage and mean fragment length through time
-
-Here we visualise the amplitude of damage (A) and the mean length of the fragments (mean_rlen) by date (BP) for the filtered dataset with a minimum of 100 reads and at the genus level (@fig-authentication-fig9).
-
-```{r eval=F}
-#Plotting damage (A) by period (dates BP)
-p2a<- dt2 %>%
-  mutate(Date_BP = fct_relevel(Date_BP,
-                             "6100","5300","4100","3900","3000", "800")) %>%
-  ggplot(aes(x=A, y=Date_BP))+ 
-  geom_boxplot(aes(x=A, y=Date_BP, fill = sample))+
-  geom_point(aes(fill = sample), size = 3, shape = 21, color = "black", stroke = .5) +
-  scale_x_continuous(limits = c(0, 0.20), breaks = seq(0, 0.20, by = 0.05)) +
-  theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
-p2a
-
-#Plotting mean length (mean_rlen) by period (dates BP)
-p2b<- dt2 %>%
-  mutate(Date_BP = fct_relevel(Date_BP,
-                             "6100","5300","4100","3900","3000", "800")) %>%
-  ggplot(aes(x=mean_rlen, y=Date_BP))+ 
-  geom_boxplot(aes(x=mean_rlen, y=Date_BP, fill = sample)) +
-  geom_point(aes(fill = sample), size = 3, shape = 21, color = "black", stroke = .5) +
-  scale_x_continuous(limits = c(30, 80), breaks = seq(30, 80, by = 10)) +
-  theme_minimal() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
-p2b
-
-#Combining the plots and saving as pdf file
-pdf(file = "figures/p2.pdf", width = 8, height = 6)
-p2 <- grid.arrange(p2a, p2b,
-                   ncol = 2, nrow = 1)
-dev.off()
-``` 
-
-![Amplitude of damage (A) and mean fragment length (mean_rlen) through time.](assets/images/chapters/authentication/p3.png){#fig-authentication-fig9}
-
 ::: {.callout-tip}
 Once finished examining the plots you can quit R
 ```bash
@@ -1038,7 +1026,8 @@ In addition, we:
 
 ## Acknowledgments
 
-We thank Mikkel Winther Pedersen and Antonio Fernandez Guerra for their contribution to the development of the `metaDMG` section.
+We thank Mikkel Winther Pedersen and Antonio Fernandez Guerra for their contribution to the development of the `metaDMG` section. 
+G.Z. acknowledges funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (Grant Agreement No. 856488, project SEACHANGE).
 
 ## Recommended Reading