#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Source: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html
'''
NLTK, a natural language toolkit for Python. A useful package for any
natural language processing task. For Mac/Unix with pip:
    $ sudo pip install -U nltk

stop_words, a Python package containing stop words. For Mac/Unix with pip:
    $ sudo pip install stop-words

gensim, a topic modeling package containing our LDA model. For Mac/Unix with pip:
    $ sudo pip install gensim
'''
import sys

from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words
from gensim import corpora
import gensim


def deumlaut(s):
    """Replace German umlauts and sharp s with ASCII transliterations."""
    s = s.replace('ß', 'ss')
    s = s.replace('ö', 'oe')
    s = s.replace('ä', 'ae')
    s = s.replace('ü', 'ue')
    s = s.replace('Ö', 'Oe')
    s = s.replace('Ä', 'Ae')
    s = s.replace('Ü', 'Ue')
    return s


tokenizer = RegexpTokenizer(r'\w+')

# create German stop word list
de_stop = get_stop_words('german')

# create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
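# Note: PorterStemmer implements the English Porter algorithm, so the stems it
# produces for German tokens are crude. A possible alternative (a sketch, not
# what the source uses) is NLTK's German Snowball stemmer:
# from nltk.stem.snowball import SnowballStemmer
# p_stemmer = SnowballStemmer('german')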
filename = sys.argv[1]
with open(filename, 'r', encoding='utf-8') as f:
    text_unicode = deumlaut(f.read())

'''
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

test_text = "Ein rosa Kleidchen, ein farblich passendes Strohhütchen auf dem Kopf, ein breites Lächeln: Carmen Geiss " \
            "(50), wie man sie kennt. Für dieses Foto kassiert die Kölner Kult-Millionärin gerade allerdings einen " \
            "üblen Shitstorm. Der Grund: Die Urlauberpose hat Carmen in einem kolumbianischen Armenviertel aufgenommen. " \
            "Dazu die Facebook-Erklärung: „HEUTE GEHT ES MAL IN DIE SLUMS VON CARTAGENA“ Neben Carmens schrillem Outfit " \
            "sorgt auch die Anreise der Geissens für Empörung: Die Millionärs-Familie legt im Luxus-Bötchen, das den " \
            "Namen „Roberto Geissini“ trägt, im Hafen der Armen an."
'''
# text_unicode = test_text

# split the input into one document per sentence
# (sent_tokenize defaults to the English punkt model; language='german' is also available)
doc_set = sent_tokenize(text_unicode)

# compile sample documents into a list
# doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop
texts = []

# loop through document list
for doc in doc_set:
    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in de_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary,
                                           passes=128, alpha='auto', eval_every=5)

'''
https://radimrehurek.com/gensim/models/ldamodel.html
LDA models can be trained - YEEEEEESSS
'''
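# Optional aside (not in the source): the page linked above also documents
# persisting a trained model and updating it online. A minimal sketch -- the
# filename and the `more_corpus` variable are made up for illustration:
# ldamodel.save('lda_de.model')
# ldamodel = gensim.models.ldamodel.LdaModel.load('lda_de.model')
# ldamodel.update(more_corpus)  # more_corpus: another list of doc2bow vectors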

print("\n\nLDAPROFILER OUTPUT:")
print(ldamodel.print_topics(num_topics=2, num_words=2))
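# Follow-up sketch (an addition, not in the source): besides the global topics,
# gensim can report the topic mixture of a single document; get_document_topics()
# returns (topic_id, probability) pairs for one bag-of-words vector.
if corpus:
    print(ldamodel.get_document_topics(corpus[0]))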