-
-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fuzzy matching for journal abbreviations #12477
base: main
Are you sure you want to change the base?
Changes from 8 commits
9ce1f36
8c34a3a
0435f38
e241bf9
14aa1ba
acfd019
c6a0d0b
50a12c2
df16223
bd5b648
b63c073
8e62cdc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,21 @@ | ||
package org.jabref.logic.journals; | ||
|
||
import java.io.Serial; | ||
import java.io.Serializable; | ||
import java.util.Objects; | ||
|
||
import org.apache.commons.text.similarity.LevenshteinDistance; | ||
|
||
public class Abbreviation implements Comparable<Abbreviation>, Serializable { | ||
|
||
@Serial | ||
private static final long serialVersionUID = 1; | ||
|
||
private transient String name; | ||
private final String abbreviation; | ||
private transient String dotlessAbbreviation; | ||
private static final LevenshteinDistance LEVENSHTEIN = LevenshteinDistance.getDefaultInstance(); | ||
|
||
// Is the empty string if not available | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are the comments removed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I’m sorry about that. This is my first open-source contribution! Personally, I often find comments to be annoying! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's alright. Read on why comments are used in code, and when they should be used. Furthermore, in open-source, whenever you touch existing code - know that it it was written by another developer and had gotten approved to be merged. So by default, any changes made to that (that don't pertain to the issue being solved) should be justified. |
||
private final transient String name; | ||
private final String abbreviation; | ||
private final transient String dotlessAbbreviation; | ||
private String shortestUniqueAbbreviation; | ||
|
||
public Abbreviation(String name, String abbreviation) { | ||
|
@@ -21,12 +25,12 @@ public Abbreviation(String name, String abbreviation) { | |
public Abbreviation(String name, String abbreviation, String shortestUniqueAbbreviation) { | ||
this(name, | ||
abbreviation, | ||
// "L. N." becomes "L N ", we need to remove the double spaces inbetween | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have you replaced the code directly from somewhere? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, but I removed some functions and rewrote them more than once, which caused changes to the code structure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reverted |
||
abbreviation.replace(".", " ").replace(" ", " ").trim(), | ||
shortestUniqueAbbreviation.trim()); | ||
} | ||
|
||
private Abbreviation(String name, String abbreviation, String dotlessAbbreviation, String shortestUniqueAbbreviation) { | ||
private Abbreviation(String name, String abbreviation, String dotlessAbbreviation, | ||
String shortestUniqueAbbreviation) { | ||
subhramit marked this conversation as resolved.
Show resolved
Hide resolved
|
||
this.name = name.intern(); | ||
this.abbreviation = abbreviation.intern(); | ||
this.dotlessAbbreviation = dotlessAbbreviation.intern(); | ||
|
@@ -58,11 +62,19 @@ public String getDotlessAbbreviation() { | |
|
||
@Override | ||
public int compareTo(Abbreviation toCompare) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What you described in the PR description should appear as JavaDoc here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added DocStrings to the code. |
||
if (isSimilar(toCompare.getName())) { | ||
return 0; | ||
} | ||
|
||
int nameComparison = getName().compareTo(toCompare.getName()); | ||
if (nameComparison != 0) { | ||
return nameComparison; | ||
} | ||
|
||
if (isSimilar(toCompare.getAbbreviation())) { | ||
return 0; | ||
} | ||
|
||
int abbreviationComparison = getAbbreviation().compareTo(toCompare.getAbbreviation()); | ||
if (abbreviationComparison != 0) { | ||
return abbreviationComparison; | ||
|
@@ -75,8 +87,10 @@ public String getNext(String current) { | |
String currentTrimmed = current.trim(); | ||
|
||
if (getDotlessAbbreviation().equals(currentTrimmed)) { | ||
return getShortestUniqueAbbreviation().equals(getAbbreviation()) ? getName() : getShortestUniqueAbbreviation(); | ||
} else if (getShortestUniqueAbbreviation().equals(currentTrimmed) && !getShortestUniqueAbbreviation().equals(getAbbreviation())) { | ||
return getShortestUniqueAbbreviation().equals(getAbbreviation()) ? | ||
getName() : getShortestUniqueAbbreviation(); | ||
} else if (getShortestUniqueAbbreviation().equals(currentTrimmed) && | ||
!getShortestUniqueAbbreviation().equals(getAbbreviation())) { | ||
arraymahdi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return getName(); | ||
} else if (getName().equals(currentTrimmed)) { | ||
return getAbbreviation(); | ||
|
@@ -85,13 +99,16 @@ public String getNext(String current) { | |
} | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "Abbreviation{name=%s, abbreviation=%s, dotlessAbbreviation=%s, shortestUniqueAbbreviation=%s}".formatted( | ||
this.name, | ||
this.abbreviation, | ||
this.dotlessAbbreviation, | ||
this.shortestUniqueAbbreviation); | ||
public boolean isSimilar(String otherName) { | ||
String normalizedThis = normalize(this.name); | ||
String normalizedOther = normalize(otherName); | ||
|
||
int distance = LEVENSHTEIN.apply(normalizedThis, normalizedOther); | ||
return distance <= 2; | ||
} | ||
|
||
private static String normalize(String input) { | ||
return input.toLowerCase().replaceAll("[^a-z0-9 ]", "").trim(); | ||
} | ||
|
||
@Override | ||
|
@@ -103,11 +120,22 @@ public boolean equals(Object o) { | |
return false; | ||
} | ||
Abbreviation that = (Abbreviation) o; | ||
return getName().equals(that.getName()) && getAbbreviation().equals(that.getAbbreviation()) && getShortestUniqueAbbreviation().equals(that.getShortestUniqueAbbreviation()); | ||
return Objects.equals(normalize(name), normalize(that.name)) && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add some JavaDoc There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think, there should be no normalization here! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I reverted this change |
||
Objects.equals(normalize(abbreviation), normalize(that.abbreviation)) && | ||
Objects.equals(normalize(shortestUniqueAbbreviation), normalize(that.shortestUniqueAbbreviation)); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(getName(), getAbbreviation(), getShortestUniqueAbbreviation()); | ||
return Objects.hash(normalize(getName()), normalize(getAbbreviation()), normalize(getShortestUniqueAbbreviation())); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should the hashcode stay the same if two abbreviations have the same "normalized" name? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did this to make sure that both values consider the same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From what I understand, there is no difference between the two names except for their case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, but my question is, why does the Please do not mark discussions resolved until the person who asked the question does so. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I'll revert the change! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reverted |
||
} | ||
|
||
@Override | ||
public String toString() { | ||
return "Abbreviation{name=%s, abbreviation=%s, dotlessAbbreviation=%s, shortestUniqueAbbreviation=%s}".formatted( | ||
this.name, | ||
this.abbreviation, | ||
this.dotlessAbbreviation, | ||
this.shortestUniqueAbbreviation); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,7 +152,7 @@ public void addCustomAbbreviations(Collection<Abbreviation> abbreviationsToAdd) | |
} | ||
|
||
public Optional<String> getNextAbbreviation(String text) { | ||
return get(text).map(abbreviation -> abbreviation.getNext(text)); | ||
return get(text).map(abbreviation -> abbreviation.getNext(text.trim())); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why a trim at the end but no trim at the beginning? Shoulnd't be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Thank you very much for your insightful feedback! I’ve reviewed all your comments and did my best to implement them." |
||
} | ||
|
||
public Optional<String> getDefaultAbbreviation(String text) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.junit.jupiter.api.Assertions.assertNotEquals; | ||
import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
||
class AbbreviationTest { | ||
|
||
|
@@ -111,10 +112,46 @@ void equals() { | |
assertNotEquals("String", abbreviation); | ||
} | ||
|
||
@Test | ||
void testSlightDifferences() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("Longn Name")); | ||
} | ||
|
||
@Test | ||
void testMissingLetter() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("Long ame")); | ||
} | ||
|
||
@Test | ||
void testPunctuationDifferences() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("Long, Name")); | ||
} | ||
|
||
@Test | ||
void testCaseDifferences() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("LONG NAME")); | ||
} | ||
|
||
@Test | ||
void equalAbbrevationsWithFourComponentsAreAlsoCompareZero() { | ||
Abbreviation abbreviation1 = new Abbreviation("Long Name", "L. N.", "LN"); | ||
Abbreviation abbreviation2 = new Abbreviation("Long Name", "L. N.", "LN"); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
} | ||
|
||
@Test | ||
void testCompareToWithFuzzyMatching() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please convert to parameterized Test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a parameterized for compareTo |
||
Abbreviation abbreviation1 = new Abbreviation("Long Name", "L. N."); | ||
Abbreviation abbreviation2 = new Abbreviation("Long Name", "L. N."); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
|
||
abbreviation2 = new Abbreviation("Long Name", "L. N. "); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
|
||
abbreviation2 = new Abbreviation("Long Name", "L. N"); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
|
||
abbreviation2 = new Abbreviation("Short Name", "S. N."); | ||
assertNotEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a specific reason why the order was changed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, but I removed some functions and rewrote them, which caused changes to the code structure
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see how that should remove class fields, but alright.