source("/Users/crs/cmu-html/462/homework/03/solutions-03.R")
# Available online from
# http://www.stat.cmu.edu/~cshalizi/462/homework/03/solutions-03.R
require(entropy)
# Take a sequence and replace the values by their frequency ranks
# Input: vector (x)
# Output: vector where each value is replaced by
# its rank in terms of frequency
symbols.2.ranks <- function(x) {
# Get the symbols in order of decreasing
# frequency
s = names(sort(table(x),decreasing=TRUE))
# Remove quotes!
s <- as.numeric(s)
# Copy the input
y = x
for (i in 1:length(s)) {
# Find the locations where x has the given
# value, replace it with the rank
y[x == s[i]] = i
}
return(y)
}
# Filter a sequence to include only the K most
# common values
# Input: vector (x), number of values (rank.max)
# Output: vector of values
# Calls: symbols.2.ranks
most.frequent <- function(x,rank.max) {
s <- symbols.2.ranks(x) # Get the rank of everything
# Pick out the positions where ranks are less
# than or equal to the max
y <- x[s <= rank.max]
return(y)
}
##### Actual simulation
## Option 1: geometrically-distributed words
# x= (rgeom(1e5,0.075)+1)
## Option 2:
# Make up geometrically-distributed "words"
# Then give them Markovianly-distributed +-1 signs
# x= (rgeom(1e5,0.075)+1)*(2*rbinmarkov(1e5,0.01,0.99)-1)
# rbinmarkov is from solutions-03.R --- it produces
# {0,1} valued Markov chain paths, with the first
# argument being the number of symbols, followed
# by the probabilities of 1 following 0 and 1
# respectively. Here it's set to be quite persistent!
## Option 3:
# Geometrically-distributed words
# divided into blocks of length runif(2,6), with Rademacher signs
# on each block
x = rgeom(1e5,0.075)+1
block.lens = sample(2:6,1e5/2,replace=TRUE)
# Since each block is at least length 2, this definitely covers
# the sequence
block.signs = 2*rbinom(1e5/2,1,0.5)-1 # Rademacher random variables
signs = rep(block.signs,block.lens)
signs = signs[1:1e5] # Get rid of the excess
x = x*signs
# Follow the procedure used in the "Indus" paper:
# filter the original sequence so it contains only
# the N most common terms, fit a Markov chain
# to the filtered sequence, and report the
# conditional entropy of the chain as a function
# of N. Conditional entropy = negative log
# likelihood per symbol, of course.
# The "Indus" paper used a smoothed estimate
# of conditional probabilities, while I'm
# doing straight MLE for simplicity, but this
# shouldn't matter much with large sequence
# length.
K.max=60 # Maximum number of values
ents = vector(length=(K.max-1))
condents = vector(length=(K.max-1))
for (k in 2:K.max) {
y = most.frequent(x,k)
condents[k-1] = -markov.mle.1(y)$log.like/length(y)
# Markov chain estimator from solutions-03.R
ents[k-1] = entropy(table(y))
}
y.max = max(c(ents,condents))
plot(2:K.max,ents,xlab="N",ylab="Entropy in nats",ylim=c(0,y.max),type="b",pch="+")
lines(2:K.max,condents,type="b")