-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #45 from bamr87:master
Clean up
- Loading branch information
Showing
34 changed files
with
17,275 additions
and
134 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"taxonomy":{"categories":[" ","-","H","Nany","Posts","T","a","cheetsheet","d","e","github","guides","home","j","k","l","machine-setup","n","n00b","notes","o","p","posts","quest","quests","quickstart","s","search","t","u","w","y"]}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import csv | ||
from collections import defaultdict | ||
|
||
# Input: the combined CSV of extracted mentions.
# Output: a cleaned CSV with one row per JSON file.
combined_csv_file = 'combined_output.csv'
cleaned_csv_file = 'cleaned_output.csv'

# Maps each JSON file name to a record holding its "title" and/or "author"
book_data = defaultdict(dict)

# Mention types that are kept, keyed to the output column each one fills
_FIELD_FOR_TYPE = {"book_title": "title", "book_author": "author"}

# Collect title/author mentions per JSON file. A later mention of the same
# type for the same file overwrites the earlier one.
with open(combined_csv_file, 'r', newline='', encoding='utf-8') as csv_file:
    for row in csv.DictReader(csv_file):
        # All three columns are read for every row, whatever its type
        json_file, mention_type, mention_text = (
            row["JSON_file"],
            row["type"],
            row["mentionText"],
        )
        field = _FIELD_FOR_TYPE.get(mention_type)
        if field is not None:
            book_data[json_file][field] = mention_text

# Emit one row per JSON file; a missing title or author becomes an empty cell
with open(cleaned_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
    field_names = ["JSON_file", "title", "author"]
    writer = csv.DictWriter(csv_file, fieldnames=field_names)
    writer.writeheader()
    writer.writerows(
        {
            "JSON_file": name,
            "title": record.get("title", ""),
            "author": record.get("author", ""),
        }
        for name, record in book_data.items()
    )

print("Cleaned CSV file creation completed.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import csv | ||
import requests | ||
from difflib import get_close_matches | ||
|
||
# Paths used by the verification step: the cleaned CSV is read, the verified
# CSV is written
cleaned_csv_file: str = "cleaned_output.csv"
verified_csv_file: str = "verified_output.csv"
|
||
# Function to verify book title and author using the Open Library API | ||
def verify_with_open_library(title, author, timeout=10):
    """Look up a title/author pair via the Open Library search endpoint.

    The query is passed through ``params`` so that ``requests`` URL-encodes
    it; interpolating the raw values into the URL breaks on characters such
    as ``&``, ``#`` or ``=`` inside a title or author name.

    Args:
        title: Book title to search for.
        author: Author name to search for.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error (without it a stalled connection would
            block the script indefinitely).

    Returns:
        A ``(title, author)`` tuple taken from the first search hit, falling
        back to the input values for fields the hit does not provide, or
        ``(None, None)`` when the response status is not 200 or the search
        has no hits.
    """
    response = requests.get(
        'http://openlibrary.org/search.json',
        params={'title': title, 'author': author},
        timeout=timeout,
    )
    if response.status_code != 200:
        return None, None

    data = response.json()
    docs = data.get('docs') or []
    # Guard on the list itself as well as the counter: indexing an empty
    # 'docs' list would raise IndexError.
    if data.get('num_found', 0) <= 0 or not docs:
        return None, None

    # Get details of the first matching book
    first_book = docs[0]
    corrected_title = first_book.get('title', title)
    # An absent or empty author list falls back to the caller's author
    author_names = first_book.get('author_name') or [author]
    return corrected_title, author_names[0]
|
||
# Function to get ISBN based on verified title | ||
def get_isbn(title, timeout=10):
    """Return an ISBN for *title* using the Open Library search endpoint.

    The title is passed through ``params`` so that ``requests`` URL-encodes
    it; interpolating the raw title into the URL breaks on characters such
    as ``&``, ``#`` or ``=``.

    Args:
        title: Book title to search for.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error.

    Returns:
        The first entry of the first hit's ``isbn`` list, or the string
        ``"ISBN Not Found"`` when the response status is not 200, the search
        has no hits, or the first hit carries no ISBN.
    """
    response = requests.get(
        'http://openlibrary.org/search.json',
        params={'title': title},
        timeout=timeout,
    )
    if response.status_code != 200:
        return "ISBN Not Found"

    data = response.json()
    docs = data.get('docs') or []
    # Guard on the list itself as well as the counter: indexing an empty
    # 'docs' list would raise IndexError.
    if data.get('num_found', 0) <= 0 or not docs:
        return "ISBN Not Found"

    # Get details of the first matching book
    identifiers = docs[0].get('isbn') or []
    return identifiers[0] if identifiers else "ISBN Not Found"
|
||
# Read the cleaned CSV file, verify each title/author pair, and write the
# original values, the verified values and an ISBN to the verified CSV file
with open(cleaned_csv_file, 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.DictReader(csv_file)

    # Open the verified CSV file for writing
    with open(verified_csv_file, 'w', newline='', encoding='utf-8') as verified_csv:
        field_names = ["JSON_file", "title", "author", "title_verified", "author_verified", "isbn"]
        writer = csv.DictWriter(verified_csv, fieldnames=field_names)
        writer.writeheader()

        for row in reader:
            json_file = row["JSON_file"]
            title = row["title"]
            author = row["author"]

            # Verify the title and author using the Open Library API
            corrected_title, corrected_author = verify_with_open_library(title, author)

            # If no match is found, fall back to difflib.
            # NOTE(review): the candidates are the comma-separated pieces of
            # the row's own value, so this can only return a fragment of the
            # input (or nothing), never an externally sourced correction.
            if corrected_title is None:
                title_parts = [part.strip() for part in title.split(",")]
                closest_titles = get_close_matches(title, title_parts)
                corrected_title = closest_titles[0] if closest_titles else "Not Found"

            if corrected_author is None:
                author_parts = [part.strip() for part in author.split(",")]
                closest_authors = get_close_matches(author, author_parts)
                corrected_author = closest_authors[0] if closest_authors else "Not Found"

            # Get ISBN based on the verified title. Skip the lookup when the
            # title is empty or the "Not Found" placeholder: searching for
            # those would look up a title that is not this row's book and
            # could attach an unrelated ISBN to the row.
            if corrected_title and corrected_title != "Not Found":
                isbn = get_isbn(corrected_title)
            else:
                isbn = "ISBN Not Found"

            # Write the verified data to the output CSV file
            writer.writerow({
                "JSON_file": json_file,
                "title": title,
                "author": author,
                "title_verified": corrected_title,
                "author_verified": corrected_author,
                "isbn": isbn,
            })

print("Verified CSV file creation completed.")
Oops, something went wrong.