
Commit

Merge pull request #68 from DerwenAI/update
Update
ceteri authored Sep 15, 2020
2 parents bce3938 + e4885fc commit 404ddef
Showing 5 changed files with 68 additions and 26 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -5,8 +5,8 @@
used to:

- extract the top-ranked phrases from text documents
- infer links from unstructured text into structured data
- run extractive summarization of text documents
- run low-cost extractive summarization of text documents
- help infer links from unstructured text into structured data

## Background

@@ -198,7 +198,7 @@ title = {PyTextRank, a Python implementation of TextRank for phrase extraction a

## Kudos

Many thanks to contributors:
Many thanks to our contributors:
[@htmartin](https://github.com/htmartin),
[@williamsmj](https://github.com/williamsmj/),
[@mattkohl](https://github.com/mattkohl),
@@ -214,6 +214,7 @@ Many thanks to contributors:
[@jake-aft](https://github.com/jake-aft),
[@junchen1992](https://github.com/junchen1992),
[@Ankush-Chander](https://github.com/Ankush-Chander),
[@shyamcody](https://github.com/shyamcody),
encouragement from the wonderful folks at [spaCy](https://github.com/explosion/spaCy),
plus general support from [Derwen, Inc.](https://derwen.ai/)

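For illustration, a minimal usage sketch of the workflow the README hunk above describes — top-ranked phrase extraction plus low-cost extractive summarization — assuming the 2.x spaCy-extension API (`TextRank()`, `PipelineComponent`, `doc._.phrases`, and the `summary()` method that appears later in this diff); the sample text and limits are made up.

```python
import spacy
import pytextrank

# load a spaCy model and register PyTextRank as a pipeline component
nlp = spacy.load("en_core_web_sm")
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

doc = nlp("PyTextRank builds a lemma graph from a document, then ranks its "
          "noun chunks and entities to surface the top phrases.")

# top-ranked phrases
for phrase in doc._.phrases[:5]:
    print(f"{phrase.rank:.4f}  {phrase.text}")

# low-cost extractive summary
for sent in tr.summary(limit_phrases=10, limit_sentences=2):
    print(sent)
```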
32 changes: 20 additions & 12 deletions changelog.txt
@@ -1,5 +1,13 @@
# PyTextRank changelog

## 2.0.3

2020-09-15

* try-catch `ZeroDivisionError` in summary method -- kudos @shyamcody
* tested with updated dependencies: `spaCy` 2.3.x and `NetworkX` 2.5


## 2.0.2

2020-05-20
@@ -12,7 +20,7 @@
2020-03-02

* fix `KeyError` issue for pre Python 3.6
* integrated codecov.io
* integrated `codecov.io`
* added PyTextRank to the spaCy uniVerse
* fixed README.md instructions to download `en_core_web_sm`

@@ -21,7 +29,7 @@

2019-11-05

* refactored library to run as a spaCy extension
* refactored library to run as a `spaCy` extension
* supports multiple languages
* significantly faster, with less memory required
* better extraction of top-ranked phrases
@@ -41,31 +49,31 @@

2019-11-01

* updated to fix for current versions of `spaCy` and `NetworkX` -- kudos @dimmu
* removed deprecated argument -- kudos @laxatives
* updated to fix for current versions of `spaCy` and `NetworkX` -- kudos @dimmu
* removed deprecated argument -- kudos @laxatives


## 1.1.1

2017-09-15

* patch disables use of NER in spaCy until an intermittent bug is resolved.
* will probably replace named tuples with spaCy spans instead.
* patch disables use of NER in `spaCy` until an intermittent bug is resolved.
* will probably replace named tuples with `spaCy` spans instead.


## 1.1.0

2017-06-07

* replaced use of TextBlob with spaCy
* updated other Py dependencies
* better handling for UTF-8
* replaced use of `TextBlob` with `spaCy`
* updated other Py dependencies
* better handling for UTF-8


## 1.0.1

2017-04-30

* updated Jupyter notebook example -- kudos @kjam
* better install/import for aptagger
* comparing spaCy performance with TextBlob
* updated Jupyter notebook example -- kudos @kjam
* better install/import for `aptagger`
* comparing `spaCy` performance with `TextBlob`
40 changes: 32 additions & 8 deletions pytextrank/pytextrank.py
@@ -100,9 +100,11 @@ def maniacal_scrubber (text):

x = str(unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8"))

# some web content returns "not string" ?? ostensibly no longer
# possibl in Py 3.x but crazy "mixed modes" of character encodings
# have been found in the wild -- YMMV
# some web content returns "not string" ??
#
# ostensibly that's no longer possible in Py 3.x; even so, some
# crazy-making "mixed modes" of character encodings have been
# found in the wild -- YMMV

try:
assert type(x).__name__ == "str"
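As an aside, a small standalone sketch of what the normalization step in this hunk does, using only the standard library; the sample string is an illustrative assumption.

```python
import unicodedata

x = "Dépôt – “mixed” text…"

# NFKD splits accented characters into base letters plus combining marks and
# expands compatibility characters (e.g. the ellipsis becomes "...");
# encoding to ASCII with errors="ignore" then drops whatever cannot be mapped
clean = str(unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8"))

print(clean)  # accents folded to ASCII, en dash and curly quotes dropped
```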
@@ -290,7 +292,7 @@ def link_sentence (self, sent):
continue

# ...otherwise proceed
key = (token.lemma_, token.pos_)
key = (token.lemma_, token.pos_,)

if key not in self.seen_lemma:
self.seen_lemma[key] = set([token.i])
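A toy illustration of the bookkeeping in the hunk above: each `(lemma, pos)` key maps to the set of token positions where it was seen. The tuples below are made-up stand-ins for spaCy `Token` objects, and the `else` branch is an assumption about the code hidden past the end of this hunk.

```python
# hypothetical (lemma, POS, position) triples standing in for spaCy tokens
tokens = [("rate", "NOUN", 0), ("loan", "NOUN", 1), ("rate", "NOUN", 2)]

seen_lemma = {}

for lemma, pos, i in tokens:
    key = (lemma, pos,)

    if key not in seen_lemma:
        seen_lemma[key] = set([i])
    else:
        # assumed continuation: accumulate every position for a repeated key
        seen_lemma[key].add(i)

print(seen_lemma)  # {('rate', 'NOUN'): {0, 2}, ('loan', 'NOUN'): {1}}
```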
@@ -378,9 +380,9 @@ def calc_textrank (self):
if self.logger:
self.logger.debug(self.seen_lemma)

# to run the algorithm, we use PageRank – i.e., approximating
# eigenvalue centrality – to calculate ranks for each of the
# nodes in the lemma graph
# to run the algorithm, we use the NetworkX implementation of
# PageRank – i.e., approximating eigenvalue centrality – to
# calculate ranks for each of the nodes in the lemma graph

self.ranks = nx.pagerank(self.lemma_graph)
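For context, a toy run of the NetworkX PageRank call used above on a hand-built graph; the nodes and edges are illustrative assumptions, not the real lemma graph.

```python
import networkx as nx

# toy stand-in for the lemma graph: nodes are lemmas, edges link co-occurring terms
g = nx.Graph()
g.add_edges_from([
    ("loan", "interest"),
    ("interest", "rate"),
    ("rate", "loan"),
    ("loan", "refinancing"),
])

ranks = nx.pagerank(g)

for node, rank in sorted(ranks.items(), key=lambda item: item[1], reverse=True):
    print(f"{rank:.4f}  {node}")
```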

@@ -393,6 +395,24 @@
for ent in self.doc.ents:
self.collect_phrases(ent)

# TODO:
# there are edge cases where the built-in noun chunking in
# spaCy fails to extract much, for example:

# > "everything you need to know about student loan interest rates variable and fixed rates capitalization amortization student loan refinancing and more."

# we should test `len(self.phrases.keys())` vs. a brute-force
# noun chunking approach, then add the brute-force override to
# `self.phrases`

#for k, p in self.phrases.items():
# print(" >>", k, p)

#for key, lemma in self.seen_lemma.items():
# node_id = list(self.seen_lemma.keys()).index(key)
# print(node_id, key, lemma, self.ranks[node_id])


# since noun chunks can be expressed in different ways (e.g., may
# have articles or prepositions), we need to find a minimum span
# for each phrase based on combinations of lemmas
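A quick, hedged sketch of the comparison suggested in the TODO above: counting spaCy's built-in noun chunks for the problem sentence, as a brute-force baseline to weigh against `len(self.phrases.keys())`. The model name is assumed to be the usual small English model.

```python
import spacy

nlp = spacy.load("en_core_web_sm")

text = ("everything you need to know about student loan interest rates "
        "variable and fixed rates capitalization amortization student loan "
        "refinancing and more.")

doc = nlp(text)

# brute-force baseline: whatever the built-in noun chunker yields
chunks = [chunk.text for chunk in doc.noun_chunks]
print(len(chunks), chunks)
```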
@@ -484,7 +504,11 @@ def summary (self, limit_phrases=10, limit_sentences=4):
# the requested limit

sum_ranks = sum(unit_vector)
unit_vector = [ rank/sum_ranks for rank in unit_vector ]

try:
unit_vector = [ rank/sum_ranks for rank in unit_vector ]
except ZeroDivisionError:
unit_vector = (0.0,) * len(unit_vector)

# iterate through each sentence, calculating its euclidean
# distance from the unit vector
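A minimal standalone sketch of the guard added in this hunk: normalizing a vector of phrase ranks, and falling back to all zeros when the ranks sum to zero (the case reported by @shyamcody). The sample vectors are illustrative.

```python
def normalize (unit_vector):
    sum_ranks = sum(unit_vector)

    try:
        unit_vector = [ rank/sum_ranks for rank in unit_vector ]
    except ZeroDivisionError:
        unit_vector = (0.0,) * len(unit_vector)

    return unit_vector

print(normalize([2.0, 1.0, 1.0]))  # [0.5, 0.25, 0.25]
print(normalize([0.0, 0.0]))       # (0.0, 0.0) -- no crash on an all-zero vector
```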
6 changes: 3 additions & 3 deletions setup.py
@@ -5,10 +5,10 @@

setuptools.setup(
name="pytextrank",
version="2.0.2",
version="2.0.3",
author="Paco Xander Nathan",
author_email="[email protected]",
description="Python implementation of TextRank for phrase extraction and summarization of text documents",
description="Python implementation of TextRank for phrase extraction and lightweight summarization of text documents",
long_description=long_description,
long_description_content_type="text/markdown",
url="http://github.com/DerwenAI/pytextrank",
@@ -36,7 +36,7 @@
"networkx",
"spacy",
],
keywords="textrank, spacy, phrase extraction, parsing, extractive summarization, natural language processing, nlp, knowledge graph, graph algorithms, text analytics",
keywords="textrank, spacy, phrase extraction, parsing, natural language processing, nlp, knowledge graph, graph algorithms, text analytics, extractive summarization",
license="MIT",
zip_safe=False,
)
9 changes: 9 additions & 0 deletions test.py
@@ -43,5 +43,14 @@ def test_enable_disable_pipeline (self):
assert len(doc._.phrases) == 0


def test_noun_chunk_fails (self):
text = "everything you need to know about student loan interest rates variable and fixed rates capitalization amortization student loan refinancing and more."
doc = self.nlp(text)
phrases = [ p.text for p in doc._.phrases ]

print("\nUSING: |{}|\n =>{}".format(text, phrases))
self.assertTrue(len(doc._.phrases) >= 0)


if __name__ == "__main__":
unittest.main()
