-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1_scopusretrieval.py
129 lines (93 loc) · 4.32 KB
/
1_scopusretrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
## Oskar, EPS, scopuscall
### Script 1: SCOPUS SEARCH ############################
# I) Send search query to scopus for literature
# II) Get subject areas for each document (another Scopus API)
#####################################
import pandas as pd
import pybliometrics.scopus as scopus
from scopusquery import EPSq
import html
import requests
#################
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',None)
pd.set_option('display.max_colwidth', 150)
#import locale
#locale.getpreferredencoding()
################ GET results
from pybliometrics.scopus.utils import config
print(config['Proxy']['ftp']) # Show keys
with requests.Session() as s:
s = scopus.ScopusSearch(EPSq, download=True, subscriber=True, refresh=True, verbose=True) # set download=True for fast, but result size only
s.get_results_size()
# must be on icta network/vpn to DL more than 5K results.
raw = pd.DataFrame(pd.DataFrame(s.results))
raw.head()
raw.columns
####### clean important columns
strcols = ['title', 'authkeywords', 'description']
pattern = '|'.join(['<inf>', '</inf>','<sup>', '</sup>'])
for colname in strcols:
# removes html tags and weird ampersands
raw[colname] = [i if isinstance(i, str) else "0" for i in raw[colname]]
raw[colname] = [html.unescape(stryng) for stryng in raw[colname]] # removes ampersands: "&"
raw[colname] = raw[colname].str.replace(pattern, "", regex=True)
raw[colname] = [stryng.strip() for stryng in raw[colname]]# removes trails
raw[colname] = [stryng.strip("'") for stryng in raw[colname]]# removes quoted titles
raw[colname] = [stryng.strip('"') for stryng in raw[colname]]# removes quoted titles
### proper date column and a articles-pr-journal-count
raw['date'] = pd.to_datetime(raw.coverDate)
raw['pubyear'] = raw['date'].dt.year
raw['journal_count']= raw.publicationName.map(raw.publicationName.value_counts())
wantedfields = ['eid', 'doi', 'pii', 'title', 'subtype', 'affilname','author_count', 'author_names','publicationName', 'issn', 'source_id', 'aggregationType', 'description', 'authkeywords', 'citedby_count', 'date', 'pubyear', 'journal_count']
df = raw[wantedfields]
len(df)
# save
len(df)
df.to_csv("data/0raw_nofilter_v1.csv", encoding='utf-8-sig') # "-sig" adds BOM bc. Excel
df = pd.read_csv("data/0raw_nofilter_v1.csv")
############# APIs for subject areas
# https://service.elsevier.com/app/answers/detail/a_id/15181/supporthub/scopus/
# https://service.elsevier.com/app/answers/detail/a_id/14882/supporthub/scopus/related/1/ What are Scopus Field Codes?
from pybliometrics.scopus import AbstractRetrieval # searches for subject for given article
#print(pybliometrics.__version__) # dev version has sessions function. 3.3.0 does not
# SerialSearch cannot be used on titles, because it isnt exact search. issn doesn't give full results.
# SerialTitle is only on Issn/e-issn
from time import sleep
rel_results_eid = df.eid.unique()
len(rel_results_eid)
unable_eids = []
eid_subjects = []
for i, eid in enumerate(rel_results_eid):
try:
st = AbstractRetrieval(f'{eid}', view='FULL', id_type='eid')
xtz = st.subject_areas
subjarea = [i.area for i in xtz]
subjabbr = [i.abbreviation for i in xtz]
subjcode = [i.code for i in xtz]
eid_subjects.append([eid, st.publicationName, subjabbr, subjarea, subjcode])
if (i+1)%900 == 0:
sleep(61)
except:
unable_eids.append(eid)
len(unable_eids)
retry_unable_eids = []
for i, un_eid in enumerate(unable_eids):
try:
st = AbstractRetrieval(f'{un_eid}', view='FULL', id_type='eid')
xtz = st.subject_areas
subjarea = [i.area for i in xtz]
subjabbr = [i.abbreviation for i in xtz]
subjcode = [i.code for i in xtz]
retry_unable_eids.append([eid, st.publicationName, subjabbr, subjarea, subjcode])
except Exception as e:
print("did not")
print(un_eid, e)
len(eid_subjects)
len(retry_unable_eids)
all_subject_data = eid_subjects + retry_unable_eids
#lost one. EID could not be found: "2-s2.0-0032010175"
df_subj = pd.DataFrame(all_subject_data, columns=['eid', 'pubname', 'subjabbr', 'subjarea','subjcode'])
len(df)
df_subj.to_csv("data/0subjarea_data_nofilter.csv", encoding='utf-8-sig') # "-sig" adds BOM bc. Excel
############################################################