📄 Doc2Vec.corp (Matlab) 2.8 KB 2016-10-31
Serialized gensim corpus for Doc2Vec (the Matlab label comes from automatic file-type detection)
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "Doc2Vec.corp",
"description": "Source code file for Doc2Vec",
"dateModified": "2016-10-31",
"dateCreated": "2025-03-23",
"contentSize": "2.8 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/Doc2Vec.corp",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Matlab"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
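Assuming Doc2Vec.corp was written with gensim's MmCorpus.serialize (its Matrix Market header is what trips language detection), a minimal loading sketch looks like this; the paths mirror the constants hard-coded in GensimCalcSimilarity.py:

from gensim import corpora

# Load the dictionary and the serialized bag-of-words corpus
# (paths as hard-coded in GensimCalcSimilarity.py)
dictionary = corpora.Dictionary.load("/home/100biere/demo/realdemo/Doc2Vec.dict")
corpus = corpora.MmCorpus("/home/100biere/demo/realdemo/Doc2Vec.corp")

for bow in corpus:  # each document is a sparse list of (token_id, weight) pairs
    print([(dictionary[token_id], weight) for token_id, weight in bow])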
🐍 GensimCalcSimilarity.py (Python) 8.0 KB 2016-11-06
Python module computing TF-IDF cosine and gensim-based similarity between source documents and generated sentences
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import time
import os, sys
import logging
import codecs
import string
import nltk
from collections import defaultdict
from pprint import pprint # pretty-printer
import gensim
from gensim import corpora, models, similarities
# https://radimrehurek.com/gensim/apiref.html
#pip install pyemd
from gensim.models import word2vec
from gensim.models import doc2vec
from sklearn.feature_extraction.text import TfidfVectorizer
#pip install --upgrade sklearn
word2vecpath = "/home/100biere/demo/realdemo/Word2Vec.mod"
doc2vecpath = "/home/100biere/demo/realdemo/Doc2Vec.mod"
doc2vecpathDict = "/home/100biere/demo/realdemo/Doc2Vec.dict"
doc2vecpathCorp = "/home/100biere/demo/realdemo/Doc2Vec.corp"
stopword = "/home/100biere/demo/realdemo/stopwordlist.de.txt"
def CalcCosineSimilarity(documents, markov_sentence):
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()  # the with-block closes the file
    stopwords = set(stopwords_tmp.strip().split())
    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

    def normalize(text):
        '''remove punctuation, lowercase, stem'''
        return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words=stopwords)
... [truncated, 170 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "GensimCalcSimilarity.py",
"description": "Python module for GensimCalcSimilarity",
"dateModified": "2016-11-06",
"dateCreated": "2025-03-23",
"contentSize": "8.0 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/GensimCalcSimilarity.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
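The preview above stops right after the TfidfVectorizer is built. A minimal sketch of how such a cosine-similarity check typically finishes (standard scikit-learn usage with stand-in data, not the repository's exact code):

from sklearn.feature_extraction.text import TfidfVectorizer

documents = ["Mehr Follower auf Instagram gewinnen.",
             "Fotos und Videos regelmaessig posten."]  # stand-in corpus
markov_sentence = "Follower gewinnen mit guten Fotos."  # stand-in candidate sentence

vectorizer = TfidfVectorizer()  # the normalize/stop word handling from above is omitted here
tfidf = vectorizer.fit_transform(documents + [markov_sentence])
# Rows are L2-normalized by default, so a dot product of two rows is their cosine.
cosine = (tfidf[-1] * tfidf[:-1].T).toarray().ravel()
print(cosine, cosine.max())  # similarity to each document, and the best match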
🐍 NP.py (Python) 4.3 KB 2016-10-29
Python module: naive noun/noun-phrase keyword extractor built on textblob-de
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
"""
A naive keyword extractor which just pulls out nouns and noun phrases.
The PerceptronTagger is _way_ faster than NLTK's default tagger, and more accurate to boot.
See <http://stevenloria.com/tutorial-state-of-the-art-part-of-speech-tagging-in-textblob/>.
However, it complicates the library's installation, and the spacy tagger is quite fast and good too.
https://github.com/frnsys/broca/blob/master/broca/common/util.py
"""
from textblob_de import TextBlobDE
#import spacy
import codecs
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}

def check_term(tdoc, term):
    if term not in tdoc:
        return False
    # If this term occurs outside of a phrase,
    # it is no longer a candidate
    n = tdoc.count(term)
    # Count phrases that contain the term
    d = sum(1 for ph in tdoc if term != ph and term in ph)
    return n > d

def check_phrase(phrase, term):
    return term in phrase

def gram_size(term):
    """
    Convenience func for getting n-gram length.
    """
    return len(term.split(' '))

def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase.
    This could be better if it just removed the individual keywords
    that occur in a phrase for each time that phrase occurs.
... [truncated, 112 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "NP.py",
"description": "Python module for NP",
"dateModified": "2016-10-29",
"dateCreated": "2025-03-23",
"contentSize": "4.3 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/NP.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
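The CFG table above follows the linked broca util.py, where adjacent tagged tokens are merged while their tag pair appears in the grammar. A minimal sketch of that merge step (illustrative; the module's own extraction function is truncated above):

CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}

def merge_tagged(tagged):
    """Greedily merge adjacent (word, tag) pairs according to CFG."""
    merged = True
    while merged:
        merged = False
        for i in range(len(tagged) - 1):
            (w1, t1), (w2, t2) = tagged[i], tagged[i + 1]
            if (t1, t2) in CFG:
                tagged = tagged[:i] + [(w1 + " " + w2, CFG[(t1, t2)])] + tagged[i + 2:]
                merged = True
                break
    return tagged

print(merge_tagged([("guter", "JJ"), ("Wein", "NN"), ("Keller", "NN")]))
# -> [('guter Wein Keller', 'NNI')]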
🐍 RuleBasedSamples.py (Python) 5.7 KB 2016-11-06
Python module that filters Markov-generated sentences with POS-rule and NER-based checks
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import time
#pip install --upgrade 3to2
#pip install --upgrade language-check
start_time = time.time()
import os
os.system('clear')
# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
#import CalcDocSimilarity
import GensimCalcSimilarity
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
import numpy
import string
import sys # import sys package, if not already imported
from textblob_de import TextBlobDE
from unidecode import unidecode
allowedRule = [u'DET',u'ADJ',u'NOUN',u'VERB',u'ADV',u'ADJ']
####allowedRule = [u'DET',u'ADJ',u'NOUN']
####allowedRule = [u'ADJ',u'NOUN']
allowedRuleString = "".join(allowedRule).replace("PUNCT","")
def createNERBasedSamples(text, markov, ndEntity):
    if not ndEntity or not markov or not text or len(ndEntity) <= 0:
        return False
    #markov += "In Berlin und Umgebung"
    goodNERFlagAny = any(x in markov for x in ndEntity)
    goodNERFlagAll = all(x in markov for x in ndEntity)
    '''
    print("\n -> Good Markov Result (NER-Based):")
    print(goodNERFlagAny)
    print(ndEntity)
    print(markov)
... [truncated, 157 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "RuleBasedSamples.py",
"description": "Python module for RuleBasedSamples",
"dateModified": "2016-11-06",
"dateCreated": "2025-03-23",
"contentSize": "5.7 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/RuleBasedSamples.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
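A minimal sketch of the gate visible in createNERBasedSamples: build a markovify model and keep only generated sentences that contain the required named entities (the goodNERFlagAll criterion). The corpus path reuses the repository's text file; the entity list is a stand-in:

import markovify

with open("instagram_ohneumlauts.txt", encoding="utf-8") as f:
    model = markovify.Text(f.read())

entities = ["Instagram"]  # stand-in for the ndEntity list passed in
for _ in range(20):
    s = model.make_sentence()  # may return None if no sentence qualifies
    if s and all(e in s for e in entities):  # the goodNERFlagAll criterion
        print(s)
        break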
🐍 SemanticSimilarity.py (Python) 10.1 KB 2016-11-06
Python module for WordNet-based semantic sentence similarity (after Li et al., via the linked blog post)
from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
import math
import numpy as np
import sys
# http://sujitpal.blogspot.de/2014/12/semantic-similarity-for-short-sentences.html
# Parameters to the algorithm, currently set to the values reported
# in the paper to produce the "best" results.
ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85
brown_freqs = dict()
N = 0
######################### word similarity ##########################
def get_best_synset_pair(word_1, word_2):
    """
    Choose the pair with highest path similarity among all pairs.
    Mimics pattern-seeking behavior of humans.
    """
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return None, None
    max_sim = -1.0
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            sim = wn.path_similarity(synset_1, synset_2)
            # path_similarity returns None for unconnected synsets
            if sim is not None and sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2
    return best_pair

def length_dist(synset_1, synset_2):
    """
    Return a measure of the length of the shortest path in the semantic
    ontology (Wordnet in our case as well as the paper's) between two
    synsets.
    """
    l_dist = sys.maxsize  # sys.maxint no longer exists in Python 3
... [truncated, 200 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "SemanticSimilarity.py",
"description": "Python module for SemanticSimilarity",
"dateModified": "2016-11-06",
"dateCreated": "2025-03-23",
"contentSize": "10.1 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/SemanticSimilarity.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
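The constants above are the Li et al. parameters from the linked post; word-level similarity combines path length and subsumer depth. A hedged sketch of that combination (the module's own length_dist/hierarchy_dist bodies are truncated above):

import math

ALPHA, BETA = 0.2, 0.45

def word_similarity(l, h):
    """l: shortest WordNet path length, h: depth of the least common subsumer."""
    f_length = math.exp(-ALPHA * l)
    f_depth = math.tanh(BETA * h)  # = (e**(B*h) - e**(-B*h)) / (e**(B*h) + e**(-B*h))
    return f_length * f_depth

print(word_similarity(l=2, h=5))  # ~0.65 for closely related words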
🐍 a1g.py (Python) 3.9 KB 2016-11-06
Python module: driver script wiring NP, RuleBasedSamples, markovify and spacy together
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download python -m spacy.de.download
# https://spacy.io/docs/#tutorials
#https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
#pip install --upgrade 3to2
#pip install --upgrade language-check
### grammar checker
######## https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
###########http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python --> https://pypi.python.org/pypi/grammar-check https://pypi.python.org/pypi/language-check
#Statistical spell- and (occasional) grammar-checker. http://lindat.mff.cuni.cz/services/korektor
#https://github.com/ufal/korektor
#/home/100biere/demo/tensorflow/models/syntaxnet#
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml
#https://github.com/jsvine/markovify
# spell correction: http://norvig.com/spell-correct.html
# grammar correct: http://www.abisource.com/projects/link-grammar/#download
# python based grammar check based on learning https://www.openhub.net/p/grac
# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/
start_time = time.time()
import os
os.system('clear')
# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
import RuleBasedSamples
#import GensimCalcSimilarity
from RuleBasedSamples import *
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
... [truncated, 76 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "a1g.py",
"description": "Python module for a1g",
"dateModified": "2016-11-06",
"dateCreated": "2025-03-23",
"contentSize": "3.9 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/a1g.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
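The imports suggest a language gate before tagging. A minimal sketch, assuming the shorthand spaCy model names of the time: detect the input language with langdetect, then load the matching spaCy pipeline:

from langdetect import detect
import spacy

text = "Mehr Follower auf Instagram gewinnen."
lang = detect(text)  # -> 'de'
nlp = spacy.load("de" if lang == "de" else "en")  # model names assumed
doc = nlp(text)
print(lang, [(t.text, t.pos_) for t in doc])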
📄 instagram_ohneumlauts.txt (Text) 37.7 KB 2016-10-29
German text corpus with umlauts transliterated (training text for the demo)
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "instagram_ohneumlauts.txt",
"description": "Source code file for instagram ohneumlauts",
"dateModified": "2016-10-29",
"dateCreated": "2025-03-23",
"contentSize": "37.7 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/instagram_ohneumlauts.txt",
"encodingFormat": "text/plain",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Text"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
🗃️ sample_nounphrase.log (Sql) 814.8 KB 2016-11-04
Plain-text log of simScore values for generated sentences (the Sql label comes from automatic file-type detection)
simScore: -> 0.59
-> Um mehr Follower zu Ihrem Thema zu tun haben, zu loeschen, tippen Sie unterhalb des Fotos oder Videos.
simScore: -> 1.12
-> Auch wenn es Ihnen darum geht, nachhaltig Follower aufzubauen, die Ihr Profil anderen Nutzern sowie Liken und Kommentieren von Beitraegen anderer User, als auch Fotos und Videos zudem deutlich weniger komplex als andere Plattformen.
simScore: -> 1.22
-> Um mehr Follower auf Instagram ausschliesslich Ihre eigenen Fotos und Videos von unterwegs aus.
simScore: -> 1.48
-> Abgesehen davon, dass die wenigsten Ihrer neuen Follower zu Ihrem Bild ausdruecken, was jedoch nicht der Etikette auf Instagram ein Mosaik von Fotos und Videos besitzen, auch wenn Sie ohne weitere Erklaerung ein Foto von einem schlagartigen Anstieg seiner Follower und Likes.
simScore: -> 1.42
-> Dabei ist es auch, dass Sie die Moeglichkeit, das Foto auf Instagram ein Mosaik von Fotos und Videos besitzen, auch wenn Sie ohne weitere Erklaerung ein Foto von einem schlagartigen Anstieg seiner Follower und Likes.
simScore: -> 1.30
-> Instagram ist mit seinem Fokus auf Fotos und Videos besitzen, auch wenn Sie ohne weitere Erklaerung ein Foto von einem schlagartigen Anstieg seiner Follower und Likes.
simScore: -> 0.88
-> Kommentare lassen sich sowohl Ihren eigenen als auch Fotos und Videos besitzen, auch wenn Sie ohne weitere Erklaerung ein Foto von einem schlagartigen Anstieg seiner Follower und Likes.
simScore: -> 1.27
-> Hier ist neben Ihren ansprechenden Fotos und Videos des Spaziergangs aus den unterschiedlichsten Perspektiven und ein aehnliches Thema wie Sie nachhaltig auf Ihre Follower irritierend wirken, wenn Sie sie oeffentlich auf Instagram sein.
simScore: -> 1.27
-> Am Anfang wird es sich gerade zu Beginn Ihrer Strategie, mehr Follower auf Instagram ausschliesslich Ihre eigenen Fotos und Videos von unterwegs aus.
simScore: -> 1.57
-> Auch wenn es Ihnen darum geht, nachhaltig Follower aufzubauen, die Ihr Profil an und wischen Sie mit dem speziellen Projekt-Hashtag versehen sind, werden von Instagram zum jetzigen Zeitpunkt nicht das Reposten von Fotos und Videos tabu und koennen als Spam gemeldet werden.
simScore: -> 1.35
-> Ein InstaWalk ist ein gemeinsamer Spaziergang durch die Stadt oder die Sie abonnieren, sollte zu Beginn Ihrer Strategie, mehr Follower auf Instagram angemeldet hat, mit dem Posten von Fotos oder Videos tippen.
simScore: -> 0.78
-> Hier ist neben Ihren ansprechenden Fotos und Videos besitzen, auch wenn Sie ohne weitere Erklaerung ein Foto von einem schlagartigen Anstieg seiner Follower und Likes.
simScore: -> 0.91
-> Zum anderen posten diese Communities haeufig Fotos und Videos besitzen, auch wenn Sie ohne weitere Erklaerung ein Foto von einem schlagartigen Anstieg seiner Follower und Likes.
simScore: -> 1.42
-> Auch wenn es Ihnen darum geht, nachhaltig Follower aufzubauen, die Ihr Profil anderen Nutzern sowie Liken und Kommentieren von Beitraegen anderer User, als auch Fotos und Videos von Instagrammern, die ihre Beitraege mit dem Profil des anderen auseinanderzusetzen.
simScore: -> 1.23
-> Um mehr Follower auf Instagram ausschliesslich Ihre eigenen Fotos und Videos zudem deutlich weniger komplex als andere Plattformen.
simScore: -> 0.94
-> Auch wenn es Ihnen darum geht, nachhaltig Follower aufzubauen, die Ihr Profil an und wischen Sie mit dem Posten von Fotos und Videos von unterwegs aus.
simScore: -> 1.37
... [truncated, 10559 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "sample_nounphrase.log",
"description": "Source code file for sample nounphrase",
"dateModified": "2016-11-04",
"dateCreated": "2025-03-23",
"contentSize": "814.8 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/sample_nounphrase.log",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Sql"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
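A minimal sketch of consuming a log like the one previewed above: parse alternating "simScore: -> x" / "-> sentence" lines and keep sentences above a threshold (the strict alternation and the threshold value are assumptions):

import codecs

THRESHOLD = 1.0  # assumed cut-off
with codecs.open("sample_nounphrase.log", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

for score_line, sent_line in zip(lines[0::2], lines[1::2]):
    if score_line.startswith("simScore"):
        score = float(score_line.split("->")[1])
        if score >= THRESHOLD:
            print(score, sent_line.lstrip("-> "))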
📄 stopwordlist.de.txt (Text) 8.3 KB 2016-10-22
German stop word list (whitespace-separated; loaded by GensimCalcSimilarity.py)
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "stopwordlist.de.txt",
"description": "Source code file for stopwordlist.de",
"dateModified": "2016-10-22",
"dateCreated": "2025-03-23",
"contentSize": "8.3 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/stopwordlist.de.txt",
"encodingFormat": "text/plain",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Text"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
version/ (7 files)
🐍 GensimCalcSimilarity.py (Python) 10.3 KB 2016-11-05
Python module for Word2Vec/Doc2Vec similarity (earlier version)
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import time
import os, sys
import logging
import codecs
import nltk
from collections import defaultdict
from pprint import pprint # pretty-printer
import gensim
from gensim import corpora, models, similarities
# https://radimrehurek.com/gensim/apiref.html
#pip install pyemd
from gensim.models import word2vec
from gensim.models import doc2vec
word2vecpath = "/home/100biere/demo/realdemo/Word2Vec.mod"
doc2vecpath = "/home/100biere/demo/realdemo/Doc2Vec.mod"
doc2vecpathDict = "/home/100biere/demo/realdemo/Doc2Vec.dict"
doc2vecpathCorp = "/home/100biere/demo/realdemo/Doc2Vec.corp"
stopword = "/home/100biere/demo/realdemo/stopwordlist.de.txt"
def CalcWordSimilarity(documents, markov_sentence, train=True):
    # https://radimrehurek.com/gensim/models/phrases.html#module-gensim.models.phrases
    # Gensim Phrases models bi- and trigrams
    # n_similarity(ws1, ws2)
    # score(sentences, total_sentences=1000000, chunksize=100, queue_factor=2, report_delay=1) --> https://radimrehurek.com/gensim/models/word2vec.html
    if train and os.path.isfile(word2vecpath):
        # NOTE: the loaded model is immediately discarded and retrained from scratch
        model = gensim.models.Word2Vec.load(word2vecpath)
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=150, size=800, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)  # can read, write, and keep training -> more memory
        #model.init_sims(replace=True)  # read-only, no more training -> less memory
        model.save(word2vecpath)
        return True
    elif train and not os.path.isfile(word2vecpath):
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=150, size=800, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)  # can read, write, and keep training -> more memory
        #model.init_sims(replace=True)  # read-only, no more training -> less memory
        model.save(word2vecpath)
        return True
    elif os.path.isfile(word2vecpath):
        model = gensim.models.Word2Vec.load(word2vecpath)
    else:
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=50, size=400, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)  # can read, write, and keep training -> more memory
... [truncated, 228 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "GensimCalcSimilarity.py",
"description": "Python module for GensimCalcSimilarity",
"dateModified": "2016-11-05",
"dateCreated": "2025-03-23",
"contentSize": "10.3 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/GensimCalcSimilarity.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
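The comments above reference n_similarity(ws1, ws2); a minimal usage sketch against the saved model (the words must be in the training vocabulary; the word lists are stand-ins):

import gensim

model = gensim.models.Word2Vec.load("/home/100biere/demo/realdemo/Word2Vec.mod")
ws1 = ["Follower", "Instagram"]  # stand-in word sets
ws2 = ["Fotos", "Videos"]
print(model.n_similarity(ws1, ws2))  # cosine of the two set means, in [-1, 1]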
🐍 NP.py (Python) 4.3 KB 2016-10-29
Python module: naive noun/noun-phrase keyword extractor (earlier version)
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
"""
A naive keyword extractor which just pulls out nouns and noun phrases.
The PerceptronTagger is _way_ faster than NLTK's default tagger, and more accurate to boot.
See <http://stevenloria.com/tutorial-state-of-the-art-part-of-speech-tagging-in-textblob/>.
However, it complicates the library's installation, and the spacy tagger is quite fast and good too.
https://github.com/frnsys/broca/blob/master/broca/common/util.py
"""
from textblob_de import TextBlobDE
#import spacy
import codecs
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}

def check_term(tdoc, term):
    if term not in tdoc:
        return False
    # If this term occurs outside of a phrase,
    # it is no longer a candidate
    n = tdoc.count(term)
    # Count phrases that contain the term
    d = sum(1 for ph in tdoc if term != ph and term in ph)
    return n > d

def check_phrase(phrase, term):
    return term in phrase

def gram_size(term):
    """
    Convenience func for getting n-gram length.
    """
    return len(term.split(' '))

def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase.
    This could be better if it just removed the individual keywords
    that occur in a phrase for each time that phrase occurs.
... [truncated, 112 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "NP.py",
"description": "Python module for NP",
"dateModified": "2016-10-29",
"dateCreated": "2025-03-23",
"contentSize": "4.3 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/NP.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
🐍 RuleBasedSamples.py (Python) 5.0 KB 2016-11-05
Python module that filters Markov-generated sentences (earlier version)
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import time
#pip install --upgrade 3to2
#pip install --upgrade language-check
start_time = time.time()
import os
os.system('clear')
# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
#import CalcDocSimilarity
import GensimCalcSimilarity
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
import numpy
import string
import sys # import sys package, if not already imported
from textblob_de import TextBlobDE
from unidecode import unidecode
allowedRule = [u'DET',u'ADJ',u'NOUN',u'VERB',u'ADV',u'ADJ']
####allowedRule = [u'DET',u'ADJ',u'NOUN']
####allowedRule = [u'ADJ',u'NOUN']
allowedRuleString = "".join(allowedRule).replace("PUNCT","")
def createNERBasedSamples(text, markov, ndEntity):
    #markov += "In Berlin und Umgebung"
    goodNERFlagAny = any(x in markov for x in ndEntity)
    goodNERFlagAll = all(x in markov for x in ndEntity)
    '''
    print("\n -> Good Markov Result (NER-Based):")
    print(goodNERFlagAny)
    print(markov)
    print("\n\n")
    sys.exit(0)
    '''
    if goodNERFlagAll:
... [truncated, 135 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "RuleBasedSamples.py",
"description": "Python module for RuleBasedSamples",
"dateModified": "2016-11-05",
"dateCreated": "2025-03-23",
"contentSize": "5.0 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/RuleBasedSamples.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
🐍 SemanticSimilarity.py (Python) 12.0 KB 2016-10-30
Python module for WordNet-based semantic sentence similarity (earlier version)
from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
import math
import numpy as np
import sys
# http://sujitpal.blogspot.de/2014/12/semantic-similarity-for-short-sentences.html
# Parameters to the algorithm, currently set to the values reported
# in the paper to produce the "best" results.
ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85
brown_freqs = dict()
N = 0
######################### word similarity ##########################
def get_best_synset_pair(word_1, word_2):
    """
    Choose the pair with highest path similarity among all pairs.
    Mimics pattern-seeking behavior of humans.
    """
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return None, None
    max_sim = -1.0
    best_pair = None, None
    for synset_1 in synsets_1:
        for synset_2 in synsets_2:
            sim = wn.path_similarity(synset_1, synset_2)
            # path_similarity returns None for unconnected synsets
            if sim is not None and sim > max_sim:
                max_sim = sim
                best_pair = synset_1, synset_2
    return best_pair

def length_dist(synset_1, synset_2):
    """
    Return a measure of the length of the shortest path in the semantic
    ontology (Wordnet in our case as well as the paper's) between two
    synsets.
    """
    l_dist = sys.maxsize  # sys.maxint no longer exists in Python 3
... [truncated, 253 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "SemanticSimilarity.py",
"description": "Python module for SemanticSimilarity",
"dateModified": "2016-10-30",
"dateCreated": "2025-03-23",
"contentSize": "12.0 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/SemanticSimilarity.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
🐍 a1e.py (Python) 7.2 KB 2016-10-31
Python module: earlier driver script variant
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download python -m spacy.de.download
# https://spacy.io/docs/#tutorials
#https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
#pip install --upgrade 3to2
#pip install --upgrade language-check
### grammar checker
######## https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
###########http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python --> https://pypi.python.org/pypi/grammar-check https://pypi.python.org/pypi/language-check
#Statistical spell- and (occasional) grammar-checker. http://lindat.mff.cuni.cz/services/korektor
#https://github.com/ufal/korektor
#/home/100biere/demo/tensorflow/models/syntaxnet#
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml
#https://github.com/jsvine/markovify
# spell correction: http://norvig.com/spell-correct.html
# grammar correct: http://www.abisource.com/projects/link-grammar/#download
# python based grammar check based on learning https://www.openhub.net/p/grac
# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/
start_time = time.time()
import os
os.system('clear')
# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
#import CalcDocSimilarity
import GensimCalcSimilarity
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
... [truncated, 206 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "a1e.py",
"description": "Python module for a1e",
"dateModified": "2016-10-31",
"dateCreated": "2025-03-23",
"contentSize": "7.2 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/a1e.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
🐍 a1f.py (Python) 3.8 KB 2016-10-31
Python module: earlier driver script variant
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download python -m spacy.de.download
# https://spacy.io/docs/#tutorials
#https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
#pip install --upgrade 3to2
#pip install --upgrade language-check
### grammar checker
######## https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
###########http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python --> https://pypi.python.org/pypi/grammar-check https://pypi.python.org/pypi/language-check
#Statistical spell- and (occasional) grammar-checker. http://lindat.mff.cuni.cz/services/korektor
#https://github.com/ufal/korektor
#/home/100biere/demo/tensorflow/models/syntaxnet#
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml
#https://github.com/jsvine/markovify
# spell correction: http://norvig.com/spell-correct.html
# grammar correct: http://www.abisource.com/projects/link-grammar/#download
# python based grammar check based on learning https://www.openhub.net/p/grac
# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/
start_time = time.time()
import os
os.system('clear')
# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
import RuleBasedSamples
#import GensimCalcSimilarity
from RuleBasedSamples import *
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
... [truncated, 73 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "a1f.py",
"description": "Python module for a1f",
"dateModified": "2016-10-31",
"dateCreated": "2025-03-23",
"contentSize": "3.8 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/a1f.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}
🐍 a1g.py (Python) 3.9 KB 2016-11-05
Python module: driver script (earlier version)
#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download python -m spacy.de.download
# https://spacy.io/docs/#tutorials
#https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
#pip install --upgrade 3to2
#pip install --upgrade language-check
### grammar checker
######## https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
###########http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python --> https://pypi.python.org/pypi/grammar-check https://pypi.python.org/pypi/language-check
#Statistical spell- and (occasional) grammar-checker. http://lindat.mff.cuni.cz/services/korektor
#https://github.com/ufal/korektor
#/home/100biere/demo/tensorflow/models/syntaxnet#
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml
#https://github.com/jsvine/markovify
# spell correction: http://norvig.com/spell-correct.html
# grammar correct: http://www.abisource.com/projects/link-grammar/#download
# python based grammar check based on learning https://www.openhub.net/p/grac
# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/
start_time = time.time()
import os
os.system('clear')
# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
import RuleBasedSamples
#import GensimCalcSimilarity
from RuleBasedSamples import *
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
... [truncated, 73 more lines] ...
{
"@context": "https://schema.org",
"@type": "SoftwareSourceCode",
"name": "a1g.py",
"description": "Python module for a1g",
"dateModified": "2016-11-05",
"dateCreated": "2025-03-23",
"contentSize": "3.9 KB",
"contentUrl": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/version/a1g.py",
"encodingFormat": "application/x-python",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "Python"
},
"codeRepository": "https://www.artikelschreiber.com/opensource/dowsery/100biere/demo/realdemo/"
}