[[TopPage]]
 
 ***TFIDF [#mc607aac]
 -1414ファイルのTFIDF
 	#! /usr/bin/env python
 	#encoding: utf-8
 	import commands
 	import glob
 	
 	import nltk
 	
 	#ファイル名取得
 	ls = commands.getoutput('/bin/ls Tw*')
 	
 	#print len(ls)
 	
 	files = ls.split('\n')
 	
 	#print len(files)
 	#for file in files:
 	#       print file
 	
 	lists = []
 	vocab = {}
 	AllWords = []
 	otaruSum = 0
 	#ファイルをリストに格納する
 	for file in files:
 	        print file
 	        list = []
 	        otaru = 0
 	        for line in open(file, 'r'):
 	                words = line[:-1].split(" ")
 	                for word in words:
 	                        list.append(word)
 	                        AllWords.append(word)
 	                        if word == "@":
 	                                otaru = 1
 	                        if vocab.has_key(word):
 	                                vocab[word] += 1
 	                        else:
 	                                vocab[word] =1
 	        lists.append(list)
 	        print "RT" , list.count("RT")
 	        if otaru == 1:
 	                otaruSum += 1
 	        print len(list)
 	
 	A = nltk.TextCollection(lists);
 	
 	print len(lists)
 	print A.tf("RT",AllWords)
 	print "RT:", A.idf("RT")
 	print "QT" ,A.idf("QT")
 	print "小樽:" ,A.idf("小樽")
 	print "東京:" , A.idf("東京")
 	print "札幌:" , A.idf("札幌")
 	print otaruSum
 	
 	for file in files:
 	        list = []
 	        for line in open(file, 'r'):
 	                words = line[:-1].split(" ")
 	                for word in words:
 	                        list.append(word)
 	        tfidf = list.count("RT")*A.idf("RT")
 	        print file , "RT の TF=" , list.count("RT") , "IDF=" , A.idf("RT") , "TFxIDF=" , tfidf
 
 ***CRF in NLTK [#xb5cc6ef]
 --http://nltk.googlecode.com/svn/trunk/doc/api/nltk.tag.crf.MalletCRF-class.html#__init__