Skip to content

Commit

Permalink
Update extractIdentifiers #595
Browse files Browse the repository at this point in the history
  • Loading branch information
tnajdek committed Feb 6, 2025
1 parent 091b7fa commit 97463ae
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 21 deletions.
90 changes: 75 additions & 15 deletions src/js/common/identifiers.js
Original file line number Diff line number Diff line change
@@ -1,15 +1,67 @@
const cleanDOI = text => {
const doi = text.match(/10(?:\.[0-9]{4,})?\/[^\s]*[^\s\.,]/);
return doi ? doi[0] : null;
/* eslint-disable no-useless-escape */
/* eslint-disable no-cond-assign */
// `cleanISBN`, `cleanDOI` and `extractIdentifiers` adapted from https://github.com/zotero/utilities/blob/43142236a282e5e1a3190694628f329aa2e0ba8e/utilities.js

/**
 * Extract a bare DOI from a piece of text.
 *
 * Input that starts with `http:` or `https:` is percent-decoded first (left as-is,
 * with a console warning, when it contains an invalid escape sequence). For any
 * input, if a `%3C` occurs before a `%3E`, every `%3C`/`%3E` is decoded to `<`/`>`.
 * The first DOI-looking substring is then taken; it never ends in whitespace,
 * a period or a comma. A trailing `]`, `)` or `}` is dropped when the text before
 * the DOI contains the matching opening bracket that has not been closed yet.
 *
 * @param {String} x Text that may contain a DOI
 * @return {String|null} The DOI, or null when nothing DOI-like was found
 * @throws {Error} When `x` is not a string
 */
function cleanDOI(/**String**/ x) {
	if (typeof x !== "string") {
		throw new Error("cleanDOI: argument must be a string");
	}

	let text = x;

	// URL-like input may carry a percent-encoded DOI
	if (/^https?:/.test(text)) {
		try {
			text = decodeURIComponent(text);
		}
		catch (e) {
			// Invalid escape sequence: keep working with the undecoded text
			console.warn(`Not decoding URL-like DOI because of invalid escape sequence: ${text}`);
		}
	}

	// Regardless of the URL check, an encoded "<" that precedes an encoded ">" is decoded
	const encodedLtPos = text.indexOf("%3C");
	if (encodedLtPos !== -1 && encodedLtPos < text.indexOf("%3E")) {
		text = text.replace(/%3C/g, "<").replace(/%3E/g, ">");
	}

	const found = text.match(/10(?:\.[0-9]{4,})?\/[^\s]*[^\s\.,]/);
	if (!found) {
		return null;
	}
	const candidate = found[0];

	// Closing bracket -> the opening bracket it would pair with
	const openerFor = new Map([
		[']', '['],
		[')', '('],
		['}', '{']
	]);
	const lastChar = candidate.slice(-1);
	const opener = openerFor.get(lastChar);
	if (opener === undefined) {
		return candidate;
	}

	// The trailing bracket belongs to the surrounding text (not the DOI) when the
	// most recent bracket of that kind before the DOI is an opening one
	const prefix = text.slice(0, found.index);
	const closesOuterBracket = prefix.lastIndexOf(opener) > prefix.lastIndexOf(lastChar);
	return closesOuterBracket ? candidate.slice(0, -1) : candidate;
}

const cleanISBN = (isbnStr, dontValidate) => {
/**
* Clean and validate ISBN.
* Return isbn if valid, otherwise return false
* @param {String} isbn
* @param {Boolean} [dontValidate=false] Do not validate check digit
* @return {String|Boolean} Valid ISBN or false
*/
function cleanISBN(isbnStr, dontValidate) {
isbnStr = isbnStr.toUpperCase()
.replace(/[\x2D\xAD\u2010-\u2015\u2043\u2212]+/g, ''); // Ignore dashes
var isbnRE = /\b(?:97[89]\s*(?:\d\s*){9}\d|(?:\d\s*){9}[\dX])\b/g,
isbnMatch;

// eslint-disable-next-line no-cond-assign
while (isbnMatch = isbnRE.exec(isbnStr)) {
var isbn = isbnMatch[0].replace(/\s+/g, '');

Expand Down Expand Up @@ -43,10 +95,9 @@ const cleanISBN = (isbnStr, dontValidate) => {
return false;
}

// https://github.com/zotero/zotero/blob/57989260935703f0c7d570a39bcf6516b8c61df6/chrome/content/zotero/xpcom/utilities_internal.js#L1409
const extractIdentifiers = text => {
const identifiers = [];
const foundIDs = new Set(); // keep track of identifiers to avoid duplicates
function extractIdentifiers(text) {
var identifiers = [];
var foundIDs = new Set(); // keep track of identifiers to avoid duplicates

// First look for DOIs
var ids = text.split(/[\s\u00A0]+/); // whitespace + non-breaking space
Expand All @@ -67,8 +118,6 @@ const extractIdentifiers = text => {
.toUpperCase();
let ISBN_RE = /(?:\D|^)(97[89]\d{10}|\d{9}[\dX])(?!\d)/g;
let isbn;

// eslint-disable-next-line no-cond-assign
while (isbn = ISBN_RE.exec(ids)) {
isbn = cleanISBN(isbn[1]);
if (isbn && !foundIDs.has(isbn)) {
Expand All @@ -82,8 +131,6 @@ const extractIdentifiers = text => {
// Next try spaces
if (!identifiers.length) {
ids = ids.replace(/[ \u00A0]+/g, ""); // space + non-breaking space

// eslint-disable-next-line no-cond-assign
while (isbn = ISBN_RE.exec(ids)) {
isbn = cleanISBN(isbn[1]);
if (isbn && !foundIDs.has(isbn)) {
Expand Down Expand Up @@ -112,7 +159,20 @@ const extractIdentifiers = text => {
}
}

// Finally try for PMID
// Next, try ADS Bibcodes (only when no identifier has been found so far)
if (!identifiers.length) {
	// regex as in the ADS Bibcode translator
	let adsBibcode_RE = /\b(\d{4}\D\S{13}[A-Z.:])\b/g;
	let adsBibcode;
	while ((adsBibcode = adsBibcode_RE.exec(text))) {
		// Deduplicate on the captured bibcode string. The match array returned by
		// exec() is a new object on every iteration, so using it as the Set key
		// would never register as a duplicate.
		const bibcode = adsBibcode[1];
		if (foundIDs.has(bibcode)) {
			// Skip the repeat but keep scanning for further, distinct bibcodes
			continue;
		}
		identifiers.push({
			adsBibcode: bibcode
		});
		foundIDs.add(bibcode);
	}
}

// Finally, try PMID
if (!identifiers.length) {
// PMID; right now, the longest PMIDs are 8 digits, so it doesn't seem like we'll
// need to discriminate for a fairly long time
Expand Down
2 changes: 1 addition & 1 deletion src/js/component/item/actions/add-by-identifier.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ const AddByIdentifier = props => {
<div className="popover-inner" role="tooltip">
<h3 className="popover-header">
<label htmlFor={ `${id.current}-input` }>
Enter a URL, ISBNs, DOIs, PMIDs, or arXiv IDs
Enter a URL, ISBNs, DOIs, PMIDs, arXiv IDs, or ADS Bibcodes to add to your library:
</label>
</h3>
<div className="popover-body">
Expand Down
2 changes: 1 addition & 1 deletion src/js/component/modal/add-by-identifier.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ const AddByIdentifierModal = () => {
onChange={ handleInputChange }
onCommit={ handleInputCommit }
onPaste={ handlePaste }
placeholder="URL, ISBNs, DOIs, PMIDs, or arXiv IDs"
placeholder="URL, ISBNs, DOIs, PMIDs, arXiv IDs, or ADS Bibcodes"
ref={ inputEl }
tabIndex={ 0 }
value={ identifier }
Expand Down
2 changes: 1 addition & 1 deletion src/js/component/modal/create-parent-item.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ const CreateParentItemModal = () => {
>
<h3>
<label htmlFor={`${id}-input`}>
Enter a DOI, ISBN, PMID, or arXiv IDs to identify this file
Enter a DOI, ISBN, PMID, arXiv ID, or ADS Bibcode to identify this file:
</label>
</h3>
<div className="form">
Expand Down
2 changes: 1 addition & 1 deletion test/items.test.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ describe('Items', () => {
);

const input = screen.getByRole('textbox',
{ name: 'Enter a URL, ISBNs, DOIs, PMIDs, or arXiv IDs' }
{ name: 'Enter a URL, ISBNs, DOIs, PMIDs, arXiv IDs, or ADS Bibcodes to add to your library:' }
);

await userEvent.type(
Expand Down
4 changes: 2 additions & 2 deletions test/parent.test.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ describe('Create Parent Item', () => {
await waitForPosition();
await user.click(screen.getByRole('menuitem', { name: 'Create Parent Item' }));
const dialog = await screen.findByRole('dialog', { name: 'Create Parent Item' });
await waitFor(() => expect(getByRole(dialog, 'textbox', { name: 'Enter a DOI, ISBN, PMID, or arXiv IDs to identify this file' })).toHaveFocus());
const input = getByRole(dialog, 'textbox', { name: 'Enter a DOI, ISBN, PMID, or arXiv IDs to identify this file' });
await waitFor(() => expect(getByRole(dialog, 'textbox', { name: 'Enter a DOI, ISBN, PMID, arXiv ID, or ADS Bibcode to identify this file:' })).toHaveFocus());
const input = getByRole(dialog, 'textbox', { name: 'Enter a DOI, ISBN, PMID, arXiv ID, or ADS Bibcode to identify this file:' });
await user.type(input, '1706.03762{enter}', { skipClick: true });

expect(await screen.findByRole('row', { name: 'Attention Is All You Need' })).toBeInTheDocument();
Expand Down

0 comments on commit 97463ae

Please sign in to comment.