#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# Setup: python -m spacy.en.download && python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
start_time = time.time()

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
#import spacy
import base64
import json
import pprint
import codecs
import nltk
import re
import sys

# Python 2 workaround so str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

from textblob_de import TextBlobDE as TextBlob


def only_letters(string):
    """Return True if the string consists solely of ASCII letters."""
    CHECK_RE = re.compile('[a-zA-Z]+$')
    return CHECK_RE.match(string) is not None


#de_nlp = spacy.load('de', tagger=True, parser=True, entity=True)

filename = "/home/100biere/v3test/input.txt"

# Get raw text as string.
with codecs.open(filename, 'r', encoding='utf8') as f:
    text = f.read()
#pprint.pprint(sentences)
#sys.exit(0)

# Build the Markov chain model and draw one sample sentence.
text_model = markovify.Text(text)

# Print five randomly-generated sentences:
#for i in range(5):
#    print(text_model.make_sentence())
markov_sample = text_model.make_sentence()

'''
de_doc = de_nlp(markov_sample)
noun_phrases = de_doc.noun_chunks
#pprint.pprint(noun_phrases)
'''

blob = TextBlob(text)
blob_markov = TextBlob(markov_sample)

'''
print("Noun chunks from spaCy:")
for np in de_doc.noun_chunks:
    print(np.text)
'''

nounPhrases = []
nounPhrMarkov = []
word = u"Baby"

# Collect noun phrases that contain only letters and at least one uppercase letter.
#print("Noun phrases from TextBlob:")
for np in blob.noun_phrases:
    v = np.encode('utf-8', 'xmlcharrefreplace')
    ok_upper = 0
    ok_only = 0
    if only_letters(v):
        ok_only = 1
    for l in v:
        if l.isupper():
            ok_upper = 1
    if ok_upper == 1 and ok_only == 1:
        nounPhrases.append(v)
        print(v)
    # Debug output: candidate phrase and its filter flags.
    print(v)
    print(ok_only)
    print(ok_upper)
    print("\n")

'''
#pprint.pprint(nounPhrases)
###print("#########################")
#pprint.pprint(nounPhrMarkov)
for np in blob_markov.noun_phrases:
    #vv = np.encode('utf-8', 'xmlcharrefreplace')
    #nounPhrMarkov.append(vv)
    #matches = {x for x in nounPhrases if x in vv}
    #pprint.pprint("Noun Phrase: " + vv), pprint.pprint(len(matches))
'''

# Check whether any noun phrase from the source text also occurs in the Markov sample.
if any(x in blob_markov.noun_phrases for x in blob.noun_phrases):
    pprint.pprint("Noun Phrase hit: ")

#print("#########################")
print("Markov Sample Text: " + markov_sample)

# Tokenize the source text into sentences, then into word lists, for Doc2Vec training.
raw_sentences = []
#tokens = nltk.word_tokenize(text)
text_sent = nltk.sent_tokenize(text)
for a in text_sent:
    words = nltk.word_tokenize(a)
    raw_sentences.append(words)

sentences = [gensim.models.doc2vec.TaggedDocument(words, [i])
             for i, words in enumerate(raw_sentences)]

model = gensim.models.Doc2Vec(sentences, iter=50, size=400, workers=1, sorted_vocab=1,
                              alpha=0.325, min_alpha=0.225, min_count=1)
model.init_sims(replace=False)   # can read, write, and keep training -> more memory
#model.init_sims(replace=True)   # read-only, no further training -> less memory
#model.save(path)

# Look up the five words most similar to `word` and print the top similarity score.
#try:
sim = model.most_similar(positive=[word], negative=[], topn=5)
v = sim[0]
vvv = v[1]
print(vvv)
#except KeyError:
#    pass

# java -jar /home/100biere/software/LanguageTool-3.5/languagetool-commandline.jar -adl -a /home/100biere/input.txt
print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)