source("/Users/crs/cmu-html/462/homework/03/solutions-03.R") # Available online from # http://www.stat.cmu.edu/~cshalizi/462/homework/03/solutions-03.R require(entropy) # Take a sequence and replace the values by their frequency ranks # Input: vector (x) # Output: vector where each value is replaced by # its rank in terms of frequency symbols.2.ranks <- function(x) { # Get the symbols in order of decreasing # frequency s = names(sort(table(x),decreasing=TRUE)) # Remove quotes! s <- as.numeric(s) # Copy the input y = x for (i in 1:length(s)) { # Find the locations where x has the given # value, replace it with the rank y[x == s[i]] = i } return(y) } # Filter a sequence to include only the K most # common values # Input: vector (x), number of values (rank.max) # Output: vector of values # Calls: symbols.2.ranks most.frequent <- function(x,rank.max) { s <- symbols.2.ranks(x) # Get the rank of everything # Pick out the positions where ranks are less # than or equal to the max y <- x[s <= rank.max] return(y) } ##### Actual simulation ## Option 1: geometrically-distributed words # x= (rgeom(1e5,0.075)+1) ## Option 2: # Make up geometrically-distributed "words" # Then give them Markovianly-distributed +-1 signs # x= (rgeom(1e5,0.075)+1)*(2*rbinmarkov(1e5,0.01,0.99)-1) # rbinmarkov is from solutions-03.R --- it produces # {0,1} valued Markov chain paths, with the first # argument being the number of symbols, followed # by the probabilities of 1 following 0 and 1 # respectively. Here it's set to be quite persistent! ## Option 3: # Geometrically-distributed words # divided into blocks of length runif(2,6), with Rademacher signs # on each block x = rgeom(1e5,0.075)+1 block.lens = sample(2:6,1e5/2,replace=TRUE) # Since each block is at least length 2, this definitely covers # the sequence block.signs = 2*rbinom(1e5/2,1,0.5)-1 # Rademacher random variables signs = rep(block.signs,block.lens) signs = signs[1:1e5] # Get rid of the excess x = x*signs # Follow the procedure used in the "Indus" paper: # filter the original sequence so it contains only # the N most common terms, fit a Markov chain # to the filtered sequence, and report the # conditional entropy of the chain as a function # of N. Conditional entropy = negative log # likelihood per symbol, of course. # The "Indus" paper used a smoothed estimate # of conditional probabilities, while I'm # doing straight MLE for simplicity, but this # shouldn't matter much with large sequence # length. K.max=60 # Maximum number of values ents = vector(length=(K.max-1)) condents = vector(length=(K.max-1)) for (k in 2:K.max) { y = most.frequent(x,k) condents[k-1] = -markov.mle.1(y)$log.like/length(y) # Markov chain estimator from solutions-03.R ents[k-1] = entropy(table(y)) } y.max = max(c(ents,condents)) plot(2:K.max,ents,xlab="N",ylab="Entropy in nats",ylim=c(0,y.max),type="b",pch="+") lines(2:K.max,condents,type="b")