Hidden Markov Models

Example and code based on the brilliant tutorial: Hidden Markov Models (HMMs) in R. Using DNA sequence as an example, HMMs are used to assign states to nucleotides. We don’t know the states (they are hidden) and we use HMMs to infer the most likely state. To get started, we will generate a sequence where we know the state. Then we can see how well the HMM performed by comparing the real states with the inferred states.

A HMM can be defined by (A, B, π), where A is a matrix of state transition probabilities, B is a vector of state emission probabilities and π is a vector of initial state distributions. Here’s the state transition probabilities:

my_state <- c("AT-rich", "GC-rich")
at_tran_prob <- c(0.7, 0.3)
gc_tran_prob <- c(0.1, 0.9)
tran_matrix <- matrix(c(at_tran_prob, gc_tran_prob), 2, 2, byrow = TRUE)
rownames(tran_matrix) <- my_state
colnames(tran_matrix) <- my_state
tran_matrix

##         AT-rich GC-rich
## AT-rich     0.7     0.3
## GC-rich     0.1     0.9

Here’s the state emission probabilities:

my_nuc <- c("A", "C", "G", "T")
at_emi_prob <- c(0.39, 0.1, 0.1, 0.41)
gc_emi_prob <- c(0.1, 0.41, 0.39, 0.1)
emi_matrix <- matrix(c(at_emi_prob, gc_emi_prob), 2, 4, byrow = TRUE)
rownames(emi_matrix) <- my_state
colnames(emi_matrix) <- my_nuc
emi_matrix

##            A    C    G    T
## AT-rich 0.39 0.10 0.10 0.41
## GC-rich 0.10 0.41 0.39 0.10

Now to generate a sequence based on our state transition and state emission probabilites.

generatehmmseq <- function(tran_matrix, emi_matrix, init_prob, seq_len){
  new_seq <- character()
  new_seq_state <- character()
  first_state <- sample(my_state, 1, prob=init_prob)
  my_prob <- emi_matrix[first_state,]
  first_nuc <- sample(my_nuc, 1, prob=my_prob)
  new_seq[1] <- first_nuc
  new_seq_state[1] <- first_state
  
  for (i in 2:seq_len){
    prev_state <- new_seq_state[i-1]
    state_probs <- tran_matrix[prev_state,]
    cur_state <- sample(my_state, 1, prob=state_probs)
    my_prob <- emi_matrix[cur_state,]
    nuc <- sample(my_nuc, 1, prob=my_prob)
    new_seq[i] <- nuc
    new_seq_state[i] <- cur_state
  }
  
  my_result <- list(sequence = new_seq, state = new_seq_state)
}

set.seed(31)
init_prob <- c(0.5, 0.5)
my_seq <- generatehmmseq(tran_matrix, emi_matrix, init_prob, 30)
my_seq

## $sequence
##  [1] "C" "A" "G" "C" "T" "G" "T" "C" "G" "C" "G" "G" "C" "T" "C" "G" "G"
## [18] "G" "C" "C" "G" "G" "G" "C" "C" "C" "A" "A" "A" "T"
## 
## $state
##  [1] "AT-rich" "AT-rich" "GC-rich" "GC-rich" "AT-rich" "AT-rich" "AT-rich"
##  [8] "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich"
## [15] "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich"
## [22] "GC-rich" "GC-rich" "GC-rich" "GC-rich" "GC-rich" "AT-rich" "AT-rich"
## [29] "AT-rich" "AT-rich"

Now that we have our sequence, we can see how well the HMM predicts the states.

viterbi <- function(sequence, transitionmatrix, emissionmatrix){
  most_prob_state <- character()
  v <- makeViterbimat(sequence, transitionmatrix, emissionmatrix)
  mostprobablestatepath <- apply(v, 1, function(x) which.max(x))
  prevnucleotide <- sequence[1]
  prevmostprobablestate <- mostprobablestatepath[1]
  prevmostprobablestatename <- my_state[prevmostprobablestate]
  most_prob_state[1] <- my_state[prevmostprobablestate]
  for (i in 2:length(sequence)){
    nucleotide <- sequence[i]
    mostprobablestate <- mostprobablestatepath[i]
    mostprobablestatename <- my_state[mostprobablestate]
    most_prob_state[i] <- my_state[mostprobablestate]
    prevnucleotide <- nucleotide
    prevmostprobablestatename <- mostprobablestatename
  }
  return(most_prob_state)
}

makeViterbimat <- function(sequence, transitionmatrix, emissionmatrix){
  sequence <- toupper(sequence)
  numstates <- dim(transitionmatrix)[1]
  v <- matrix(NA, nrow = length(sequence), ncol = dim(transitionmatrix)[1])
  v[1, ] <- 0
  v[1,1] <- 1
  for (i in 2:length(sequence)){
    for (l in 1:numstates){
      statelprobnucleotidei <- emissionmatrix[l,sequence[i]]
      v[i,l] <- statelprobnucleotidei * max(v[(i-1),] * transitionmatrix[,l])
    }
  }
  return(v)
}

my_most_prob_state <- viterbi(my_seq$sequence, tran_matrix, emi_matrix)
data.frame(nuc = my_seq$sequence, real = my_seq$state, inferred = my_most_prob_state, match = my_seq$state == my_most_prob_state)

##    nuc    real inferred match
## 1    C AT-rich  AT-rich  TRUE
## 2    A AT-rich  AT-rich  TRUE
## 3    G GC-rich  GC-rich  TRUE
## 4    C GC-rich  GC-rich  TRUE
## 5    T AT-rich  GC-rich FALSE
## 6    G AT-rich  GC-rich FALSE
## 7    T AT-rich  GC-rich FALSE
## 8    C GC-rich  GC-rich  TRUE
## 9    G GC-rich  GC-rich  TRUE
## 10   C GC-rich  GC-rich  TRUE
## 11   G GC-rich  GC-rich  TRUE
## 12   G GC-rich  GC-rich  TRUE
## 13   C GC-rich  GC-rich  TRUE
## 14   T GC-rich  GC-rich  TRUE
## 15   C GC-rich  GC-rich  TRUE
## 16   G GC-rich  GC-rich  TRUE
## 17   G GC-rich  GC-rich  TRUE
## 18   G GC-rich  GC-rich  TRUE
## 19   C GC-rich  GC-rich  TRUE
## 20   C GC-rich  GC-rich  TRUE
## 21   G GC-rich  GC-rich  TRUE
## 22   G GC-rich  GC-rich  TRUE
## 23   G GC-rich  GC-rich  TRUE
## 24   C GC-rich  GC-rich  TRUE
## 25   C GC-rich  GC-rich  TRUE
## 26   C GC-rich  GC-rich  TRUE
## 27   A AT-rich  GC-rich FALSE
## 28   A AT-rich  AT-rich  TRUE
## 29   A AT-rich  AT-rich  TRUE
## 30   T AT-rich  AT-rich  TRUE

Published: January 26 2017

category:
machine learning 2