Here is a script for fetching all records from an OAI-PMH repository using Python. I hope it serves as a useful reference.
import requests
from requests import Request
import xml.etree.ElementTree as ET
# Harvest every record of one set from an OAI-PMH repository, following
# resumptionTokens until the server reports that the list is complete.

# Define the endpoint
base_url = 'https://curation.library.t.u-tokyo.ac.jp/oai'
# XML namespace shared by all OAI-PMH 2.0 protocol elements
OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'
# Seconds to wait for the server; requests waits indefinitely without this
REQUEST_TIMEOUT = 60

# Initial OAI-PMH request
params = {
    'verb': 'ListRecords',
    'metadataPrefix': 'curation',
    'set': '97590',
}

# Prepare the initial request only to show the full URL that will be sent
req = Request('GET', base_url, params=params)
prepared_req = req.prepare()
print("Sending request to:", prepared_req.url)  # Output the URL

data = []
# Fetch all data, one page per iteration
while True:
    response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
    # Fail on 4xx/5xx instead of trying to parse an error page as XML
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # An OAI-PMH error response carries no records and no token, so without
    # this check the loop would end and report a partial harvest as complete.
    # 'noRecordsMatch' only means the result is empty, which is not a failure.
    error = root.find(f'{OAI_NS}error')
    if error is not None and error.get('code') != 'noRecordsMatch':
        raise RuntimeError(
            f"OAI-PMH error {error.get('code')}: {(error.text or '').strip()}"
        )

    # Process records
    for record in root.findall(f'.//{OAI_NS}record'):
        id_element = record.find(f'.//{OAI_NS}identifier')
        # Guard against a record without an identifier element
        identifier = id_element.text if id_element is not None else None
        print(f'Record ID: {identifier}')
        # Other data can be processed here as well
        data.append(record)

    # Get resumptionToken; a missing or empty token marks the last page
    token_element = root.find(f'.//{OAI_NS}resumptionToken')
    token = ''
    if token_element is not None and token_element.text:
        # Strip whitespace that XML pretty-printing may put around the token
        token = token_element.text.strip()
    if not token:
        break  # End loop if no token

    # Follow-up requests carry only the verb and the token
    params = {
        'verb': 'ListRecords',
        'resumptionToken': token,
    }

print("All records have been fetched.")
print(len(data))