Fetching All Records from an OAI-PMH Repository Using Python

Here is a script for fetching all records from an OAI-PMH repository using Python. I hope it serves as a useful reference.

import requests
from requests import Request
import xml.etree.ElementTree as ET

# Harvest every record of one OAI-PMH set by issuing a ListRecords request
# and then following resumptionToken elements until the repository stops
# returning one. Each <record> element is collected in `data`.

# Define the endpoint
base_url = 'https://curation.library.t.u-tokyo.ac.jp/oai'

# Clark-notation prefix for elements in the OAI-PMH 2.0 namespace
OAI_NS = '{http://www.openarchives.org/OAI/2.0/}'

# Seconds to wait for the server; requests applies no timeout by default,
# so without this a stalled connection would hang the harvest forever.
REQUEST_TIMEOUT = 60

# Initial OAI-PMH request
params = {
    'verb': 'ListRecords',
    'metadataPrefix': 'curation',
    'set': '97590',
}

# Prepare (but do not send) the initial request so the full URL can be
# shown before any network traffic happens.
req = Request('GET', base_url, params=params)
prepared_req = req.prepare()
print("Sending request to:", prepared_req.url)  # Output the URL

data = []

# Fetch all data; a Session reuses the connection across pages
with requests.Session() as session:
    while True:
        response = session.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
        # Fail loudly on HTTP errors instead of trying to parse an error page
        response.raise_for_status()
        root = ET.fromstring(response.content)

        # OAI-PMH reports protocol-level problems as <error> elements in an
        # otherwise successful HTTP response, so they must be checked here.
        errors = root.findall(f'{OAI_NS}error')
        if errors:
            codes = [error.get('code') for error in errors]
            if all(code == 'noRecordsMatch' for code in codes):
                break  # Empty result set: nothing (more) to harvest
            details = '; '.join(
                f"{error.get('code')}: {(error.text or '').strip()}"
                for error in errors
            )
            raise RuntimeError(f'OAI-PMH error response: {details}')

        # Process records
        for record in root.findall(f'.//{OAI_NS}record'):
            # findtext returns None rather than raising if no identifier exists
            identifier = record.findtext(f'.//{OAI_NS}identifier')
            print(f'Record ID: {identifier}')
            # Other data can be processed here as well

            data.append(record)

        # Get resumptionToken and execute next request
        token_element = root.find(f'.//{OAI_NS}resumptionToken')
        if token_element is None or not token_element.text:
            break  # End loop if no token

        # A resumptionToken is an exclusive argument: it is sent with the
        # verb only, without metadataPrefix or set.
        params = {
            'verb': 'ListRecords',
            'resumptionToken': token_element.text,
        }

print("All records have been fetched.")

print(len(data))