#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# pip install --upgrade 3to2
# pip install --upgrade language-check
# http://polyglot.readthedocs.io/en/latest/Installation.html

import time

start_time = time.time()

import codecs
import os
import os.path
import pprint
import re
import string
import sys

import markovify
import numpy
import polyglot
import spacy
from langdetect import detect
from polyglot.text import Text, Word
from textblob_de import TextBlobDE
from unidecode import unidecode

import NP
#import CalcDocSimilarity
import GensimCalcSimilarity

os.system('clear')

# POS-tag sequence a Markov sentence must contain to pass the grammar filter.
allowedRule = [u'DET', u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADJ']
####allowedRule = [u'DET',u'ADJ',u'NOUN']
####allowedRule = [u'ADJ',u'NOUN']
allowedRuleString = "".join(allowedRule).replace("PUNCT", "")


def createNERBasedSamples(text, markov, ndEntity):
    """Accept a Markov sentence if it contains at least one named entity from the source text."""
    if not ndEntity or not markov or not text:
        return False

    #markov += "In Berlin und Umgebung"
    goodNERFlagAny = any(x in markov for x in ndEntity)
    goodNERFlagAll = all(x in markov for x in ndEntity)  # computed for inspection, currently unused

    if not goodNERFlagAny:
        #print(" -> No Markov Result (NER-Based)")
        return False

    simScore = GensimCalcSimilarity.CalcDocSimilarity(text, markov)
    syntaxScore = GensimCalcSimilarity.CalcWordSimilarity(text, markov, train=False)  # currently unused
    syntaxScore2 = GensimCalcSimilarity.CalcCosineSimilarity(text, markov)            # currently unused

    print("###############################")
    print(" -> Similarity Score (NER-Based):")
    print(simScore)
    print("\n -> Good Markov Result (NER-Based):")
    print(markov)
    print("\n\n")

    with codecs.open("sample_nerbased.log", 'a', encoding='utf-8') as f:
        f.write("\nsimScore: -> ")
        f.write('% 6.2f' % simScore)
        f.write("\n -> ")
        f.write(markov)
        f.write("\n")

    #time.sleep(3)
    return True


def createGrammarBasedSamples(text, markov, nlp):
    """Accept a Markov sentence whose POS-tag sequence contains the allowed grammar rule."""
    if not nlp or not markov or not text:
        return False

    posTagMarkov = [token.pos_ for token in nlp(markov)]
    posTagMarkovString = "".join(posTagMarkov).replace("PUNCT", "")

    if allowedRuleString not in posTagMarkovString:
        #print(" -> No Markov Result (Rule-Based-Grammar)")
        return False

    #print("Match (Rule-Based-Grammar):")
    #print(allowedRule)
    #print(posTagMarkov)
    simScore = GensimCalcSimilarity.CalcDocSimilarity(text, markov)
    syntaxScore = GensimCalcSimilarity.CalcWordSimilarity(text, markov, train=False)  # currently unused
    syntaxScore2 = GensimCalcSimilarity.CalcCosineSimilarity(text, markov)            # currently unused

    #print(" -> Similarity Score (Rule-Based-Grammar):")
    #print(simScore)
    print("\n -> Good Markov Result (Rule-Based-Grammar):")
    print(markov)
    print("\n\n")

    with codecs.open("sample_rulebased.log", 'a', encoding='utf-8') as f:
        f.write("\nsimScore: -> ")
        f.write('% 6.2f' % simScore)
        f.write("\n -> ")
        f.write(markov)
        f.write("\n")

    #time.sleep(3)
    return True


def createNounPhraseSamples(text, markov, nounPhrases, nlp):
    """Accept a Markov sentence that contains every noun phrase extracted from the source text."""
    if not nlp or not markov or not text or not nounPhrases:
        return False

    iNP = NP.tokenize(markov, nlp)

    # Flat list from iNP -> always just one element, which holds the noun phrases.
    nPList = nounPhrases[0]
    tmpList = iNP[0]

    goodFlagAny = any(x in tmpList for x in nPList)  # computed for inspection, currently unused
    goodFlagAll = all(x in tmpList for x in nPList)

    if not goodFlagAll:
        #print(" -> No Markov Result (Noun Phrase)")
        return False

    #print("Match (Noun Phrase MARKOV):")
    #print(tmpList)
    #print("Match (Noun Phrase STANDARD):")
    #print(nPList)
    simScore = GensimCalcSimilarity.CalcDocSimilarity(text, markov)
    syntaxScore = GensimCalcSimilarity.CalcWordSimilarity(text, markov, train=False)  # currently unused
    syntaxScore2 = GensimCalcSimilarity.CalcCosineSimilarity(text, markov)            # currently unused

    print("###############################")
    print(" -> Similarity Score (Noun-Phrase):")
    print(simScore)
    print("\n -> Good Markov Result (Noun-Phrase):")
    print(markov)
    print("\n\n")

    with codecs.open("sample_nounphrase.log", 'a', encoding='utf-8') as f:
        f.write("\nsimScore: -> ")
        f.write('% 6.2f' % simScore)
        f.write("\n -> ")
        f.write(markov)
        f.write("\n")

    #time.sleep(3)
    return True
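

# --- Usage sketch (illustrative only) ----------------------------------------
# A minimal, hypothetical driver showing how the three filters above could be
# wired to a markovify model. The corpus file ("corpus.txt"), the spaCy model
# name ('de_core_news_sm'), the entity/noun-phrase preparation, and the number
# of attempts are assumptions for illustration, not part of this module.
if __name__ == "__main__":
    nlp = spacy.load('de_core_news_sm')  # assumed German spaCy model
    with codecs.open("corpus.txt", 'r', encoding='utf-8') as f:  # hypothetical corpus file
        text = f.read()

    textModel = markovify.Text(text)
    entities = [ent.text for ent in nlp(text).ents]  # named entities of the source text
    nounPhrases = NP.tokenize(text, nlp)             # same helper as used in createNounPhraseSamples

    for _ in range(100):
        markov = textModel.make_sentence()  # may return None if no sentence passes markovify's checks
        if not markov:
            continue
        createNERBasedSamples(text, markov, entities)
        createGrammarBasedSamples(text, markov, nlp)
        createNounPhraseSamples(text, markov, nounPhrases, nlp)

    print("--- %s seconds ---" % (time.time() - start_time))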