voice vignette

version 0.5.2

Filipe J. Zabala

2025-07-11

0. Basic installation

# Development version from GitHub
devtools::install_github('filipezabala/voice')
# Stable from CRAN
install.packages('voice')

More details at https://github.com/filipezabala/voice.

1. Extract features

1.1 Load packages and audio files

# packs
library(voice)
library(tidyverse)

# get path to audio file
wavDir <- list.files(system.file('extdata', package = 'wrassp'),
                     pattern = glob2rx('*.wav'), full.names = TRUE)

1.2 Extract features

# minimal usage
M <- voice::extract_features(wavDir)
glimpse(M)
#> Rows: 1,196
#> Columns: 13
#> $ section_seq      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
#> $ section_seq_file <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
#> $ wav_path         <chr> "/Library/Frameworks/R.framework/Versions/4.5-arm64/R…
#> $ f0               <dbl> NA, NA, NA, NA, NA, NA, NA, 115.8593, 108.9439, 107.4…
#> $ f1               <int> NA, NA, NA, NA, 185, 260, 254, 277, 261, 231, 177, 19…
#> $ f2               <int> 1854, 1886, 1749, 1888, 1962, 1973, 2026, 2037, 2130,…
#> $ f3               <int> NA, 2893, 2676, 2659, 2639, 2676, 2993, 2932, 3016, 2…
#> $ f4               <int> 3113, 3708, 3509, 3658, 3248, 3239, 3830, 3479, 3561,…
#> $ f5               <int> 4191, 4678, 4502, 4331, 3653, 3836, 4602, 4585, 4720,…
#> $ f6               <int> 5226, 5659, 5035, 5177, 5208, 5146, 5233, 5390, 5366,…
#> $ f7               <int> 6077, 6725, 6526, 6518, 6493, 6567, 6603, 6532, 6510,…
#> $ f8               <int> 6675, NA, NA, NA, 7681, 7751, 7803, NA, 7835, 7614, 7…
#> $ gain             <dbl> 21.63347, 22.76034, 28.52825, 29.67069, 36.25124, 43.…

2. Tag

# creating Extended synthetic data
E <- dplyr::tibble(subject_id = c(1,1,1,2,2,2,3,3,3), wav_path = wavDir)
E
#> # A tibble: 9 × 2
#>   subject_id wav_path                                                           
#>        <dbl> <chr>                                                              
#> 1          1 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 2          1 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 3          1 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 4          2 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 5          2 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 6          2 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 7          3 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 8          3 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…
#> 9          3 /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/libra…

# minimal usage
voice::tag(E)
#> # A tibble: 9 × 7
#>   wav_path   f0_tag_mean f0_tag_sd f0_tag_vc f0_tag_median f0_tag_iqr f0_tag_mad
#>   <chr>            <dbl>     <dbl>     <dbl>         <dbl>      <dbl>      <dbl>
#> 1 /Library/…        85.4      17.6     0.206          76.1       29.4       7.53
#> 2 /Library/…        85.4      15.6     0.183          80.1       27.8      14.4 
#> 3 /Library/…        84.6      13.0     0.154          78.8       23.9      14.0 
#> 4 /Library/…        84.8      14.5     0.171          79.1       28.1      11.9 
#> 5 /Library/…        86.0      14.7     0.170          78.7       30.0      11.0 
#> 6 /Library/…        82.9      15.6     0.188          74.8       23.8       4.78
#> 7 /Library/…        78.2      16.2     0.207          73.5       13.4       6.82
#> 8 /Library/…        84.5      14.5     0.172          78.1       17.8       8.95
#> 9 /Library/…        81.0      12.2     0.151          75.9       23.1       9.14

# canonical data
voice::tag(E, groupBy = 'subject_id')
#> # A tibble: 3 × 7
#>   subject_id f0_tag_mean f0_tag_sd f0_tag_vc f0_tag_median f0_tag_iqr f0_tag_mad
#>        <dbl>       <dbl>     <dbl>     <dbl>         <dbl>      <dbl>      <dbl>
#> 1          1        85.1      15.3     0.180          78.3       26.8      11.9 
#> 2          2        84.6      14.9     0.176          76.4       28.3       7.97
#> 3          3        81.0      14.6     0.180          75.6       21.6       8.68

3. Visualization

3.1 Get audio

url0 <- 'https://github.com/filipezabala/voiceAudios/raw/refs/heads/main/wav/doremi.wav'
download.file(url0, paste0(tempdir(), '/doremi.wav'), mode = 'wb')
voice::embed_audio(url0) # See https://github.com/mccarthy-m-g/embedr for more details.

3.2 Media data

M <- voice::extract_features(tempdir())
summary(M)
#>   section_seq    section_seq_file   wav_path               f0       
#>  Min.   :  1.0   Min.   :  1.0    Length:591         Min.   :121.3  
#>  1st Qu.:148.5   1st Qu.:148.5    Class :character   1st Qu.:147.8  
#>  Median :296.0   Median :296.0    Mode  :character   Median :204.8  
#>  Mean   :296.0   Mean   :296.0                       Mean   :208.7  
#>  3rd Qu.:443.5   3rd Qu.:443.5                       3rd Qu.:259.6  
#>  Max.   :591.0   Max.   :591.0                       Max.   :339.1  
#>                                                      NA's   :111    
#>        f1               f2             f3             f4             f5      
#>  Min.   : 175.0   Min.   : 662   Min.   :1802   Min.   :2775   Min.   :3857  
#>  1st Qu.: 397.0   1st Qu.:1061   1st Qu.:2510   1st Qu.:3350   1st Qu.:4296  
#>  Median : 550.0   Median :1266   Median :2630   Median :3612   Median :4452  
#>  Mean   : 539.1   Mean   :1347   Mean   :2626   Mean   :3563   Mean   :4458  
#>  3rd Qu.: 658.0   3rd Qu.:1609   3rd Qu.:2717   3rd Qu.:3764   3rd Qu.:4572  
#>  Max.   :1421.0   Max.   :3230   Max.   :3940   Max.   :4603   Max.   :5425  
#>  NA's   :30       NA's   :19     NA's   :17     NA's   :15     NA's   :20    
#>        f6             f7             f8            gain      
#>  Min.   :4501   Min.   :5173   Min.   :6632   Min.   :10.89  
#>  1st Qu.:5164   1st Qu.:6381   1st Qu.:7287   1st Qu.:19.34  
#>  Median :5416   Median :6504   Median :7458   Median :21.39  
#>  Mean   :5394   Mean   :6496   Mean   :7467   Mean   :21.48  
#>  3rd Qu.:5555   3rd Qu.:6622   3rd Qu.:7628   3rd Qu.:23.24  
#>  Max.   :6357   Max.   :7618   Max.   :8255   Max.   :34.34  
#>  NA's   :15     NA's   :9      NA's   :51

3.3 Plot

voice::piano_plot(M, 0) # f0

voice::piano_plot(M, 0:1) # f0 + f1

3.4 Assign notes

(f0_spn <- voice::assign_notes(M, fmt = 0, min_points = 22, min_percentile = .85)) # f0
#> [1] C3 D3 E4 E3 G3 A3 B3 C4
#> 108 Levels: C0 C#0 D0 D#0 E0 F0 F#0 G0 G#0 A0 A#0 B0 C1 C#1 D1 D#1 E1 F1 ... B8
(f1_spn <- voice::assign_notes(M, fmt = 1, min_points = 22, min_percentile = .85)) # f1
#> [1] B4  C#5 E4  D5  D5  F5  F4  D#5
#> 108 Levels: C0 C#0 D0 D#0 E0 F0 F#0 G0 G#0 A0 A#0 B0 C1 C#1 D1 D#1 E1 F1 ... B8

3.5 Sheet music

This section requires MuseScore and the gm package to be installed.

3.5.1 Notes sequence of f0

library(gm)
line_0 <- gm::Line(as.character(f0_spn))
m0 <- gm::Music() +
  gm::Meter(4, 4) +
  line_0
gm::show(m0, to = c('score', 'audio'))

3.5.2 Notes sequences of f0 and f1

line_0 <- gm::Line(as.character(f0_spn))
line_1 <- gm::Line(as.character(f1_spn))
m1 <- gm::Music() +
  gm::Meter(4, 4) +
  line_0 + line_1
gm::show(m1, to = c('score', 'audio'))

4. Advanced installation

The Python-based functions diarize and extract_features (the latter only when extracting the f0_praat and fmt_praat features) require a configured Python environment.

4.1 Ubuntu

The following steps are used to fully configure voice on Ubuntu 24.04 LTS (Noble Numbat). Reports of inconsistencies are welcome.

4.1.1. Curl

Command line tool and library for transferring data with URLs.

# installing dependencies
sudo apt-get update
sudo apt-get install -y libssl-dev autoconf libtool make
# installing curl
sudo apt install curl
# verify installation
curl --version

4.1.2. ffmpeg

ffmpeg is a cross-platform solution to record, convert and stream audio and video.

sudo apt-get update
sudo apt-get install ffmpeg

4.1.3. Audio drivers and extra packages

sudo apt-get update
sudo apt-get install portaudio19-dev libasound2-dev libfontconfig1-dev libmagick++-dev libxml2-dev libharfbuzz-dev libfribidi-dev libgdal-dev cmake cmake-doc ninja-build

4.1.4. MuseScore

MuseScore is an open source notation software.

sudo add-apt-repository ppa:mscore-ubuntu/mscore-stable
sudo apt-get update
sudo apt-get install musescore

4.1.5. R

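R is a free software environment for statistical computing and graphics. To find out your Ubuntu release and its codename, run lsb_release -a in a terminal. The repository line below uses the focal codename (Ubuntu 20.04); substitute the codename of your own release (for example, noble for Ubuntu 24.04).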

sudo sh -c 'echo "deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/" >> /etc/apt/sources.list.d/cran.list'
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9 
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 51716619E084DAB9
gpg -a --export E084DAB9 | sudo apt-key add -

sudo add-apt-repository ppa:c2d4u.team/c2d4u4.0+

sudo apt-get update && sudo apt-get upgrade
sudo apt-get install r-base r-base-dev

4.1.6. RStudio

RStudio is an Integrated Development Environment (IDE) for R. Check for updates here.

sudo apt-get update
sudo apt-get install gdebi-core
wget https://download1.rstudio.org/electron/jammy/amd64/rstudio-2025.05.0-496-amd64.deb
sudo gdebi rstudio-2025.05.0-496-amd64.deb

4.1.9. R packages

“Packages are the fundamental units of reproducible R code.” — Hadley Wickham and Jennifer Bryan. The installation may take several minutes. In a terminal, run:

sudo R

With R running as superuser, paste the following, one line at a time:

packs <- c('audio','reticulate','R.utils','seewave','tidyverse','tuneR','wrassp')
install.packages(packs, dep = TRUE)
update.packages(ask = FALSE)
devtools::install_github('egenn/music')
devtools::install_github('flujoo/gm')

To configure the gm package, run:

usethis::edit_r_environ()

Add the line MUSESCORE_PATH=/usr/bin/mscore to the /root/.Renviron file. If the file opens in vi, type :wq to save and exit. Then restart the R/RStudio session.

4.1.10. Miniconda

Miniconda is a free minimal installer for conda, an open source package, dependency and environment management system for any language—Python, R, Ruby, Lua, Scala, Java, JavaScript, C/ C++, FORTRAN and more, that runs on Windows, macOS and Linux.
Follow the instructions at https://docs.conda.io/en/latest/miniconda.html.

At terminal:

cd ~/Downloads/
wget -r -np -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
cd repo.anaconda.com/miniconda/
bash Miniconda3-latest-Linux-x86_64.sh

Do you accept the license terms? [yes|no] yes.

Miniconda3 will now be installed into this location: /home/user/miniconda3 [ENTER]

You can undo this by running conda init --reverse $SHELL? yes

Do you wish the installer to initialize Miniconda3 by running conda init? yes.

Close and reopen terminal.

conda update -n base -c defaults conda

The following packages will be INSTALLED/REMOVED/UPDATED/DOWNGRADED:… Proceed ([y]/n)? y

conda create -n pyvoice python=3.12

The following (NEW) packages will be downloaded/INSTALLED:… Proceed ([y]/n)? y

conda activate pyvoice
pip install -r https://raw.githubusercontent.com/filipezabala/voice/master/requirements.txt

4.2 MacOS

The following steps are used to fully configure voice on MacOS Sonoma (Link to MacOS Sequoia). Reports of inconsistencies are welcome.

4.2.1. Homebrew

Install Homebrew, ‘The Missing Package Manager for macOS (or Linux)’, and remember to run brew doctor occasionally. In a terminal (command + space 'terminal') run:

/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

4.2.2. wget

GNU Wget is a free software package for retrieving files using HTTP, HTTPS, FTP and FTPS, the most widely used Internet protocols. It is a non-interactive commandline tool, so it may easily be called from scripts, cron jobs, terminals without X-Windows support, etc.

brew install wget

4.2.3. Python

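Python is a programming language that lets you work quickly and integrate systems. According to this post, it is recommended to install Python 3.8 and 3.9 and keep the installation consistent; note that the command below installs Python 3.12, the same version used for the pyvoice conda environment created later in this guide.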

brew install python@3.12
python3 --version 
pip3 --version

4.2.4. ffmpeg

ffmpeg is a cross-platform solution to record, convert and stream audio and video. The installation may take several minutes.

brew install ffmpeg

4.2.5. XQuartz

The XQuartz project is an open-source effort to develop a version of the X.Org X Window System that runs on macOS.

4.2.7. tcllib

sudo port selfupdate && sudo port upgrade tcllib
sudo port install tcllib

4.2.8. MuseScore

MuseScore is an open source notation software.

4.2.9. R

R is a free software environment for statistical computing and graphics.

4.2.10. RStudio

RStudio is an Integrated Development Environment (IDE) for R.

4.2.11. R packages

“Packages are the fundamental units of reproducible R code.” — Hadley Wickham and Jennifer Bryan. Open a terminal (command + space 'terminal') and run:

sudo R

With R running as superuser, paste the following, one line at a time:

packs <- c('audio','reticulate','R.utils','seewave','tidyverse','tuneR','wrassp')
install.packages(packs, dep = TRUE)
update.packages(ask = FALSE)
devtools::install_github('egenn/music')
devtools::install_github('flujoo/gm')

4.2.12. Miniconda

Miniconda is a free minimal installer for conda, an open source package, dependency and environment management system for any language—Python, R, Ruby, Lua, Scala, Java, JavaScript, C/ C++, FORTRAN and more, that runs on Windows, macOS and Linux.

For Intel (x86_64) Macs use

cd ~/Downloads
wget -r -np -k https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
cd repo.anaconda.com/miniconda/
bash Miniconda3-latest-MacOSX-x86_64.sh

For Apple silicon (arm64; M1 or later) Macs use

cd ~/Downloads
wget -r -np -k https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh
cd repo.anaconda.com/miniconda/
bash Miniconda3-latest-MacOSX-arm64.sh

In order to continue the installation process, please review the license agreement. Please, press ENTER to continue ENTER.

You can undo this by running conda init --reverse $SHELL? yes

Close and reopen terminal.

export PATH="~/miniconda3/bin:$PATH"
conda update -n base -c defaults conda

The following packages will be INSTALLED/REMOVED/UPDATED/DOWNGRADED:… Proceed ([y]/n)? y

conda create -n pyvoice python=3.12

The following (NEW) packages will be downloaded/INSTALLED:… Proceed ([y]/n)? y

Close and reopen terminal.

conda activate base
conda activate pyvoice
pip install -r https://raw.githubusercontent.com/filipezabala/voice/master/requirements.txt

5. Diarize

# download
url0 <- 'https://github.com/filipezabala/voiceAudios/raw/main/wav/sherlock0.wav'
wavDir <- normalizePath(tempdir())
download.file(url0, paste0(wavDir, '/sherlock0.wav'), mode = 'wb')

Diarization can be performed to detect speaker segments (i.e., ‘who spoke when’).

# diarize
voice::diarize(fromWav = wavDir, toRttm = wavDir, token = 'YOUR_TOKEN')
#> Time difference of 24.49055 secs

The voice::diarize() function creates Rich Transcription Time Marked (RTTM)¹ files, space-delimited text files containing one turn per line, defined by NIST (National Institute of Standards and Technology). The RTTM files can be read using voice::read_rttm().

# read_rttm
(rttm <- voice::read_rttm(wavDir))
#> $doremi.rttm
#>      type   file chnl  tbeg  tdur ortho stype       name conf slat
#> 1 SPEAKER doremi    1 0.031 5.805  <NA>  <NA> SPEAKER_00 <NA> <NA>
#> 
#> $sherlock0.rttm
#>      type      file chnl   tbeg  tdur ortho stype       name conf slat
#> 1 SPEAKER sherlock0    1  0.908 5.231  <NA>  <NA> SPEAKER_00 <NA> <NA>
#> 2 SPEAKER sherlock0    1  6.933 6.463  <NA>  <NA> SPEAKER_00 <NA> <NA>
#> 3 SPEAKER sherlock0    1 13.565 8.674  <NA>  <NA> SPEAKER_00 <NA> <NA>

Finally, the audio waves can be automatically segmented.

# split audio wave
voice::splitw(fromWav = wavDir, fromRttm = wavDir, to = wavDir)
#> TOTAL TIME 0.262 SECONDS
dir(wavDir, pattern = '.[Ww][Aa][Vv]$')
#> [1] "doremi_split_1.wav"    "doremi.wav"            "sherlock0_split_1.wav"
#> [4] "sherlock0_split_2.wav" "sherlock0_split_3.wav" "sherlock0.wav"

  1. See Appendix C at https://www.nist.gov/system/files/documents/itl/iad/mig/KWS15-evalplan-v05.pdf.