TopPage

TFIDF

  • 1414ファイルのTFIDF
    	#! /usr/bin/env python
    	#encoding: utf-8
    	import nltk
    	import commands
    	
    	#ファイル名取得
    	ls = commands.getoutput('/bin/ls Tw*')
    	
    	#print len(ls)
    	
    	files = ls.split('\n')
    	
    	#print len(files)
    	#for file in files:
    	#       print file
    	
    	lists = []
    	vocab = {}
    	AllWords = []
    	otaruSum = 0
    	#ファイルをリストに格納する
    	for file in files:
    	        print file
    	        list = []
    	        otaru = 0
    	        for line in open(file, 'r'):
    	                words = line[:-1].split(" ")
    	                for word in words:
    	                        list.append(word)
    	                        AllWords.append(word)
    	                        if word == "@":
    	                                otaru = 1
    	                        if vocab.has_key(word):
    	                                vocab[word] += 1
    	                        else:
    	                                vocab[word] =1
    	        lists.append(list)
    	        print "RT" , list.count("RT")
    	        if otaru == 1:
    	                otaruSum += 1
    	        print len(list)
    	
    	A = nltk.TextCollection(lists);
    	
    	print len(lists)
    	print A.tf("RT",AllWords)
    	print "RT:", A.idf("RT")
    	print "QT" ,A.idf("QT")
    	print "小樽:" ,A.idf("小樽")
    	print "東京:" , A.idf("東京")
    	print "札幌:" , A.idf("札幌")
    	print otaruSum
    	
    	for file in files:
    	        list = []
    	        for line in open(file, 'r'):
    	                words = line[:-1].split(" ")
    	                for word in words:
    	                        list.append(word)
    	        tfidf = list.count("RT")*A.idf("RT")
    	        print file , "RT の TF=" , list.count("RT") , "IDF=" , A.idf("RT") , "TFxIDF=" , tfidf

CRF in NLTK