#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Run a small German NLP pipeline over the input text and build a result record.

The text is read from the files named on the command line (or stdin) via
``fileinput``. The script then:

1. keeps the raw text as the "sentences" field,
2. parses it with textblob-de (POS tags),
3. extracts named entities with polyglot,
4. extracts noun phrases with textblob-de and writes them, one per line, to
   ``nomenphrasen.txt``,
5. assembles a ``post`` dict intended for the ``posts`` collection of the
   ``DumpB2C`` MongoDB database (the actual insert is currently disabled).

Setup notes carried over from the original header:
    Source: http://rdrpostagger.sourceforge.net/
    python -m nltk.downloader all
    pip install -U nltk
    apt-get install python-numpy python-scipy python-matplotlib ipython \
        ipython-notebook python-pandas python-sympy python-nose
    pip install -U numpy
    python -m pip install pymongo

References:
    http://textblob-de.readthedocs.org/en/latest/
    https://pypi.python.org/pypi/pygermanet
    https://github.com/wroberts/pygermanet
    http://api.mongodb.org/python/current/tutorial.html
    Co-occurrence idea: https://github.com/maciejkula/glove-python
"""

import codecs
import datetime
import fileinput
import pprint
import sys

import pymongo
import polyglot
from polyglot.text import Text, Word
from textblob_de import TextBlobDE as TextBlob
from textblob_de import PatternParser
from textblob import Blobber
from textblob.taggers import NLTKTagger
from textblob.tokenizers import SentenceTokenizer
from textblob_de import BlobberDE
from textblob_de.taggers import PatternTagger
from textblob_de.tokenizers import PatternTokenizer
from pymongo import MongoClient
from pygermanet import load_germanet

# Separator appended after every entity / noun phrase in the stored strings.
FIELD_SEPARATOR = "#;#\n"

# GermaNet handle; not queried below, but loaded as in the original script.
# Example lookup: gn.synset(u'funktionieren.v.2')
gn = load_germanet()

client = MongoClient('mongodb://localhost:27017/')
db = client['DumpB2C']
# Alternative collection used previously: db['B2Cv2']
collection = db['ParsedTemp']

# Read the complete input in one go instead of growing a string line by line.
content = "".join(fileinput.input())

blob = TextBlob(content)
text = Text(content)

# Debugging helper, e.g. pp.pprint(blob.tags).
pp = pprint.PrettyPrinter(depth=6)

# 1. Sentences: the raw text is stored as-is (sentence splitting via
#    blob.sentences is currently disabled).
sent = content

# 2. POS tagging happens through blob.parse() when the post is assembled.

# 3. Named entities.
# NOTE(review): polyglot entity chunks appear to be list-like sequences of
# words, so concatenating one directly with a str would raise TypeError; the
# words of each chunk are joined with spaces first -- confirm against polyglot.
nner = "".join(" ".join(chunk) + FIELD_SEPARATOR for chunk in text.entities)

# 4. Noun phrases.
# blob.noun_phrases is a collection of strings, not a string, so it cannot be
# handed to write() directly; it is serialised explicitly instead.
noun_phrases = list(blob.noun_phrases)

# Same separator-terminated format as the entity string; this also defines
# `nphrase`, which the post below needs (it was previously undefined).
nphrase = "".join(phrase + FIELD_SEPARATOR for phrase in noun_phrases)

# One noun phrase per line; the with-block closes the file on exit.
with codecs.open("nomenphrasen.txt", "w", "utf-8-sig") as ntemp:
    ntemp.write("".join(phrase + "\n" for phrase in noun_phrases))

# 5. Word counting (iterating blob.tokens and writing them to
#    "allewoerter.txt") is currently disabled, as are the "word_count" and
#    "word_content" fields of the post.

post = {
    "sentences": sent,
    "pos_tags": blob.parse(),
    "ner_tags": nner,
    "noun_phrases": nphrase,
    "date": datetime.datetime.utcnow(),
}

posts = db.posts
# Writing to MongoDB is intentionally disabled; re-enable with:
# posts.insert_one(post)

sys.exit()