ダミーデータで試す - ゼロからの学習日記

ryamada2222222012-12-26

ダミーデータを作ってみる(数行くらいのファイルが200個)
Rでの描図は重くて無理なので

# Export g.plus (built by the script further down) as a GraphML file so that
# it can be opened in Gephi instead of being drawn in R
write.graph(g.plus, "test.graphml", format="graphml")

として、Gephi(こちら)に開かせてみる（ノード数：2283、エッジ数：5188）

その上で適当に２ノードを選んで、その最短パス上のノードと、そこからの距離が３以内のノードのみからなるサブグラフを(これはRで)描かせてみる

# Generate dummy input: n.files small "files", each stored as a character
# matrix with 4 to 20 rows and n.cols columns. Every row holds between 2 and
# n.cols words (left-aligned, remaining cells NA) sampled without replacement
# from a per-file vocabulary of n.word random words.
n.char <- 2      # letters per random word / file name
n.files <- 200   # number of dummy files
n.word <- 5      # vocabulary size of each file
n.cols <- 4      # maximum number of words in one row
# A row samples up to n.cols distinct words from the vocabulary
stopifnot(n.word >= n.cols)

# One random word made of n.char distinct lowercase letters
random.word <- function(){
	paste(sample(letters,n.char),collapse="")
}

# Preallocated results: file names and one word matrix per file
infiles <- character(n.files)
infile.list <- vector("list", n.files)
for(i in seq_len(n.files)){
	# The order of the random draws (file name, row count, vocabulary, rows)
	# is kept fixed so that a given seed always yields the same data
	infiles[i] <- random.word()
	n.line <- sample(4:20,1)
	vocab <- vapply(seq_len(n.word), function(j) random.word(), character(1))
	tmp <- matrix(NA_character_,nrow=n.line,ncol=n.cols)
	for(j in seq_len(n.line)){
		tmp.word <- sample(2:n.cols,1)
		tmp[j,seq_len(tmp.word)] <- sample(vocab,tmp.word)
	}
	infile.list[[i]] <- tmp
}

# Build two undirected graphs from the word tables in infile.list.
#   g      : plain word graph; in every row the first word is linked to each
#            of the other words of that row
#   g.plus : every edge of g is replaced by an intermediate "edge node" (the
#            two endpoint names, sorted and pasted together) that is linked to
#            both endpoints and to a node named after the source file
#            (infiles[i])
n.infiles <- length(infile.list)
v.list <- vector("list", n.infiles)       # words of each file
v.plus.list <- vector("list", n.infiles)  # words + file node + edge nodes
e.list <- vector("list", n.infiles)       # 2-column edge list of g, per file
e.plus.list <- vector("list", n.infiles)  # 2-column edge list of g.plus, per file
no.edges <- matrix(character(0), nrow=0, ncol=2)
for(i in seq_len(n.infiles)){
	# Work on a matrix
	infile.m <- as.matrix(infile.list[[i]])

	# Distinct words of this file; which() also drops NA cells
	unique.word <- unique(c(infile.m))
	unique.word <- unique.word[which(unique.word != "")]
	v.list[[i]] <- unique.word

	# For every row: one edge (first word, k-th word) per additional word.
	# Rows holding fewer than two words contribute no edge.
	row.edges <- lapply(seq_len(nrow(infile.m)), function(j){
		num.kids <- length(which(infile.m[j,] != "")) - 1
		if(num.kids < 1){
			return(NULL)
		}
		unname(cbind(rep(infile.m[j,1], num.kids), infile.m[j, 1 + seq_len(num.kids)]))
	})
	# Bind once instead of growing the matrix edge by edge; starting from an
	# empty 0 x 2 matrix keeps the result a matrix even when there are no edges
	e.list[[i]] <- do.call(rbind, c(list(no.edges), row.edges))

	n.edge <- nrow(e.list[[i]])
	tmp.v <- character(0)
	e.plus.list[[i]] <- no.edges
	if(n.edge > 0){
		# Edge-node name: both endpoint names in sorted order, so that a-b and
		# b-a map onto the same node
		sorted.e.list <- t(apply(e.list[[i]],1,sort))
		tmp.v <- paste(sorted.e.list[,1],sorted.e.list[,2],sep="")
		# Three g.plus edges per original edge, interleaved in the order
		# (endpoint 1, edge node), (endpoint 2, edge node), (edge node, file node)
		e.plus.list[[i]] <- cbind(
			c(rbind(e.list[[i]][,1], e.list[[i]][,2], tmp.v)),
			c(rbind(tmp.v, tmp.v, rep(infiles[i], n.edge)))
		)
	}
	v.plus.list[[i]] <- unique(c(unique.word,infiles[i],tmp.v))
}

# Vertices shared between files are merged by name
unique.v <- unique(unlist(v.list))
unique.plus.v <- unique(unlist(v.plus.list))
g <- graph.empty(directed=FALSE) + vertices(unique.v)
g.plus <- graph.empty(directed=FALSE) + vertices(unique.plus.v)

# Add all edges of a graph in a single call (file order and edge order are
# preserved); edges() takes the endpoints flattened as from1, to1, from2, to2, ...
all.e <- do.call(rbind, e.list)
all.plus.e <- do.call(rbind, e.plus.list)
if(length(all.e) > 0){
	g <- g + edges(c(t(all.e)))
}
if(length(all.plus.e) > 0){
	g.plus <- g.plus + edges(c(t(all.plus.e)))
}
#plot(g,vertex.label=V(g)$name,vertex.size=3,edge.arrow.mode=0)
#dev.new()
#plot(g.plus,vertex.label=V(g.plus)$name,vertex.size=3,edge.arrow.mode=0)
#plot(g,vertex.label=V(g)$name,vertex.size=3,edge.arrow.mode=0)
#dev.new()
#plot(g.plus,vertex.label=V(g.plus)$name,vertex.size=3,edge.arrow.mode=0)

# Compute the shortest-path distances between all vertices of g.plus up front
sh.paths.mat <- shortest.paths(graph = g.plus, v = V(g.plus))
# その上で、

# Plot the subgraph of `g` induced by every vertex that lies within distance
# `L` of at least one vertex in `vs`.
#   g  : igraph graph to search and to draw
#   vs : one or more vertices of `g` (vertex names or numeric ids)
#   L  : maximum shortest-path distance from `vs` for a vertex to be kept
# Distances are computed on `g` itself, so the function does not depend on a
# precomputed global distance matrix and is correct for whichever graph is
# passed in.
show.neighbors <- function(g,vs,L=1){
	# One row per source vertex in vs, one column per vertex of g
	dist.from.vs <- shortest.paths(g, v = vs)
	# Keep a vertex as soon as any source vertex reaches it within L steps
	keep <- which(colSums(dist.from.vs <= L) > 0)

	tmp.g <- induced.subgraph(g,keep)
	plot(tmp.g,vertex.label=V(tmp.g)$name,vertex.size=3,edge.arrow.mode=0)
}
# Display the subgraph of g.plus made of the nodes within distance 1 of a
# single node (here: the 10th vertex)
tenth.node <- V(g.plus)$name[10]
show.neighbors(g.plus, vs = tenth.node, L = 1)

# For a set of nodes, display one neighbourhood subgraph per distance in `Ls`:
# each plot shows the nodes within that distance of the node set.
#   g.plus : igraph graph to search and to draw
#   vs     : vertices (names or numeric ids) forming the node set
#   Ls     : distances to step through, one plot each
# The user is prompted before each new plot; the previous `ask` setting is
# restored on exit, including when plotting fails part-way through.
show.neighbors.series <- function(g.plus,vs,Ls=0:4){
	old.par <- par(ask=TRUE)
	on.exit(par(old.par), add=TRUE)
	for(i in Ls){
		show.neighbors(g.plus,vs,L=i)
	}
	invisible(NULL)
}

# Pick two nodes (the 10th and the 100th vertex) and get a shortest path
# between them
two <- V(g.plus)$name[c(10, 100)]
sh.paths <- get.shortest.paths(g.plus, from = two[1], to = two[2])
# Use the nodes on that path as the node set and display its neighbourhood
# subgraphs for distances 0 to 3
path.nodes <- unlist(sh.paths)
show.neighbors.series(g.plus, path.nodes, Ls = 0:3)