Princeton vs. Facebook: modeling contagion to predict social network growth
This post is based on http://blog.revolutionanalytics.com/2014/02/princeton-vs-facebook-modeling-contagion.html
Researchers at Princeton posted a study on arXiv titled "Epidemiological modeling of online social network dynamics". According to their model, Facebook will lose 80% of its users between 2015 and 2017.
Facebook's data scientists retaliated in kind, using Google Trends (plus Google Scholar data) to jokingly predict that Princeton will run out of students by 2021. The Google Scholar data show that Princeton's publication counts have been declining since 2000:
[Figure: Princeton publication counts on Google Scholar]
Princeton's enrollment numbers are highly correlated with its Google Trends series (see the figure below), and searches for "Princeton" on Google Trends have been falling in recent years:
[Figure: Princeton enrollment vs. Google Trends search volume]
Of course, although Facebook and Princeton both face new threats (Facebook vs. newer social networking sites, Princeton vs. MOOCs), neither will necessarily collapse as quickly as these simple models predict. Beyond the need for caution about the data sources, one important reason is the set of assumptions built into contagion models. Below, we fit an SIR model to the Google Trends data for Myspace.
SIR describes three disease states: susceptibles --> infecteds --> recovereds
First, assume that everyone in the population is equally likely to come into contact with the "virus", and call the infection rate beta. Also assume that every infected person recovers with the same probability, and call the recovery rate gamma. This gives the following differential equations:
dS/dt = -beta * S * I
dI/dt = beta * S * I - gamma * I
dR/dt = gamma * I
The author uses the "deSolve" package to simulate the infection process and the "optim" function to fit the parameters. This yields the fitted curve shown below (to my eye, the fit is not as good as the one in the Princeton paper):
[Figure: SIR model fit to the Myspace Google Trends data]
Here is the R code implementing the whole process:
install.packages(c("deSolve", "RCurl","stringr", "ggplot2", "plyr"))
library(deSolve)
library(ggplot2)
library(plyr)
library(RCurl) # For getURL() and curl handler / cookie / google login
library(stringr) # For str_trim() to trim whitespace from strings
# Credits/Inspiration
# http://christophriedl.net/2013/08/22/google-trends-with-r/
# http://www.samsi.info/
# Google account settings
#username <- "YOUR_NAME@gmail.com"
#password <- "YOUR_PASSWORD"
# URLs
loginURL <- "https://accounts.google.com/accounts/ServiceLogin"
authenticateURL <- "https://accounts.google.com/accounts/ServiceLoginAuth"
trendsURL <- "http://www.google.com/trends?"
## This gets the GALX cookie which we need to pass back with the login form
getGALX <- function(curl) {
  txt <- basicTextGatherer()
  curlPerform(url=loginURL, curl=curl, writefunction=txt$update, header=TRUE, ssl.verifypeer=FALSE)
  tmp <- txt$value()
  # Split the response headers into lines and pull out the GALX cookie value
  val <- grep("Cookie: GALX", strsplit(tmp, "\n")[[1]], value=TRUE)
  return(strsplit(val, "[:=;]")[[1]][3])
}
## Function to perform Google login and get cookies ready
gLogin <- function(username, password) {
  ch <- getCurlHandle()
  ans <- curlSetOpt(curl = ch,
                    ssl.verifypeer = FALSE,
                    useragent = getOption('HTTPUserAgent', "R"),
                    timeout = 60,
                    followlocation = TRUE,
                    cookiejar = "./cookies",
                    cookiefile = "")
  galx <- getGALX(ch)
  authenticatePage <- postForm(authenticateURL, .params=list(Email=username, Passwd=password, GALX=galx, PersistentCookie="yes", continue="http://www.google.com/trends"), curl=ch)
  authenticatePage2 <- getURL("http://www.google.com", curl=ch)
  if(getCurlInfo(ch)$response.code == 200) {
    print("Google login successful!")
  } else {
    print("Google login failed!")
  }
  return(ch)
}
## Function to query a search string
gQuery <- function(username, password, queryString){
  ch <- gLogin(username, password)
  authenticatePage2 <- getURL("http://www.google.com", curl=ch)
  res <- getForm(trendsURL, q=queryString, content=1, export=1, graph="all_csv", curl=ch)
  #str(res)
  # Check if quota limit reached
  if( grepl("You have reached your quota limit", res) ) {
    stop("Quota limit reached; you should wait a while and try again later")
  }
  # Parse the response into a data frame
  # We skip the first few rows, which contain the Google header; we then read 500 rows up to the current date
  x <- try( read.table(text=res, sep=",", col.names=c("Week", "TrendsCount"), skip=32, nrows=500) )
  return(x)
}
myspaceData <- gQuery(username, password, "myspace")
qplot(Week, TrendsCount, data=myspaceData)
fbData <- gQuery(username, password, "facebook")
qplot(Week, TrendsCount, data=fbData)
Propagation <- myspaceData$TrendsCount
# The code above pulls the Myspace Google Trends data from within R; alternatively,
# you can run the search on Google Trends yourself and download the results as a CSV.
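# (Sketch, my addition: if you do download the CSV manually -- the file name below
# is hypothetical -- the equivalent of gQuery() is just reading it with the same
# skip/column settings.)
# myspaceData <- read.table("myspace_trends.csv", sep=",",
#                           col.names=c("Week", "TrendsCount"), skip=32, nrows=500)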
# Next comes the SIR fitting itself; first we need an R function defining the SIR model
SIRmodel <- function(t, x, params) {
  with(as.list(c(params, x)), {
    dS <- -b*S*I
    dI <- b*S*I - g*I
    dR <- g*I
    res <- c(dS, dI, dR)
    list(res)
  })
}
## Parameters
## (Just pick some numbers)
params <- c(b=0.5, g=0.5)
## vector of time steps
times <- seq(1, 500, by=1)
## Initial conditions
IC <- c(S=99.99, I=0.01, R=0)
## Solving ODE
Output <- lsoda(IC, times, SIRmodel, params)
plot(Output)
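# (My addition, a standard SIR fact not spelled out in the original post: I(t)
# grows only while beta*S > gamma, i.e. while the effective reproduction number
# R_eff = beta*S/gamma exceeds 1, so the curve peaks where R_eff crosses 1.
# This built-in rise-and-fall is what makes the SIR shape plausible for a
# social network's lifecycle.)
R_eff <- params["b"] * Output[, "S"] / params["g"]
peak <- which.max(Output[, "I"])
cat("I peaks at t =", Output[peak, "time"], "where R_eff =", round(R_eff[peak], 3), "\n")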
# Function that measures the distance between the SIRmodel solution and the actual incidence data
SIRdist <- function(x){
  # x should have 3 entries:
  # x[1] = b
  # x[2] = g
  # x[3] = initial number of infected, 0 to 100
  #        (not actually used below; the initial condition is hard-coded)
  GhostParameters <- c(b=x[1], g=x[2])
  InitialCondition <- c(S=100-.01, I=.01, R=0)
  Ghost <- lsoda(InitialCondition, times, SIRmodel, GhostParameters)
  a <- Ghost[,"I"]
  Dist <- sum((Propagation-a)^2)
  Dist
}
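# (My addition: SIRdist() compares Propagation element-wise against the 500
# simulated weeks, so the two vectors must be the same length -- which is why
# gQuery() reads at most 500 rows. A quick check and one trial evaluation:)
stopifnot(length(Propagation) == length(times))
SIRdist(c(0.001, 0.005, 0.01)) # sum of squared errors at one plausible parameter choice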
# optim has a tendency to find a local minimum instead of the global one, hence
# zeroing in on the right region of the solution space before running optim
beta.in <- seq(0.000, 0.0019, by=.0001)   # 20 candidate values for b
gamma.in <- seq(0.004, 0.0059, by=.0001)  # 20 candidate values for g
n <- 1
SIRDistMat <- array(1:400, dim=c(20,20))  # 20x20 grid to hold the distances
for (i in beta.in){
  m <- 1
  for(j in gamma.in){
    SIRDistMat[n,m] <- SIRdist(c(i, j, 0.01))
    m <- m+1
  }
  n <- n+1
}
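# (Equivalent sketch, my rewrite rather than the author's code: the same grid
# search can be written in one line with outer().)
# SIRDistMat <- outer(beta.in, gamma.in,
#                     Vectorize(function(b, g) SIRdist(c(b, g, 0.01))))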
newStartingPoint <- arrayInd(which.min(SIRDistMat), dim(SIRDistMat))
# New starting point for optim
y <- c(beta.in[newStartingPoint[1]], gamma.in[newStartingPoint[2]])
# Pass control=list(trace=3) to optim() if you'd like to see what's going on
X <- optim(y, SIRdist)
a <- X$par
OptimalParams <- c(b=a[1], g=a[2])
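# (An alternative I'd suggest, not what the original used: both rates must be
# non-negative, so a bounded optimizer is arguably safer here.)
# X <- optim(y, SIRdist, method="L-BFGS-B", lower=c(0, 0))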
IC <- c(S=99.99, I=.01, R=0) # same IC as in SIRdist()
Solution <- lsoda(IC,times, SIRmodel, OptimalParams)
df <- data.frame(t(rbind(myspaceData$Week, myspaceData$TrendsCount, Solution[,"I"])))
names(df) <- c("Week", "Actual", "Model")
ggplot() +
  geom_point(data = df, aes(Week, Actual)) +
  geom_point(data = df, aes(Week, Model), colour = "red") +
  ylab("Model vs. Actual")
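With the fitted parameters in hand, "predicting the future" is just a matter of solving the model on a longer time grid. A hypothetical extrapolation of my own, subject to all of the caveats about model assumptions raised above:
# Project the fitted model 200 weeks beyond the observed data
futureTimes <- seq(1, 700, by=1)
Projection <- lsoda(IC, futureTimes, SIRmodel, OptimalParams)
plot(Projection[, "time"], Projection[, "I"], type="l",
     xlab="Week", ylab="Modeled interest (projected)")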