PYTHON

Fetch All Pages from a Paginated REST API

Efficiently retrieve all data from a paginated REST API endpoint using a recursive fetching strategy in Python, handling 'next page' links or page numbers.

import requests
import time

def fetch_paginated_data(base_url, params=None, all_data=None, page_key='page',
                         next_page_link_key='next', delay=0.5, timeout=30):
    """
    Fetch every page of a paginated JSON API and return the combined items.

    Two pagination styles are supported:

    * Next-link mode (default): after each page, the value stored under
      ``next_page_link_key`` in the JSON object is followed as the URL of the
      next page. Fetching stops when that key is missing or empty.
    * Page-number mode (``next_page_link_key`` is ``'None'`` or ``None``):
      the ``page_key`` query parameter is incremented by one after each page.
      Fetching stops at the first page that is not a non-empty list.

    Items are taken from the ``'results'`` key when the response is a JSON
    object that has one; otherwise the whole payload is used. A payload that
    is a bare JSON list is accepted as the list of items.

    Args:
        base_url (str): URL of the first page.
        params (dict, optional): Query parameters for the first request.
            The dict is copied and never modified. Defaults to None.
        all_data (list, optional): Existing list to append to. It is extended
            in place and returned. Defaults to a new empty list.
        page_key (str): Name of the page-number query parameter (used only in
            page-number mode). A missing value is treated as page 1.
        next_page_link_key (str): Key in the JSON response holding the URL of
            the next page, or ``'None'``/``None`` to use page numbers.
        delay (float): Seconds to sleep between requests. Defaults to 0.5.
        timeout (float): Per-request timeout in seconds passed to
            ``requests.get``. Defaults to 30.

    Returns:
        list: All items fetched. If a request fails or a body is not valid
        JSON, the error is printed and the items collected so far are
        returned.

    Raises:
        ValueError, TypeError: In page-number mode, if the current value of
            ``page_key`` cannot be converted to an integer.
    """
    # Local import so the function does not depend on extra file-level imports.
    from urllib.parse import urljoin

    if all_data is None:
        all_data = []

    url = base_url
    query = dict(params) if params else {}
    use_page_numbers = next_page_link_key in (None, 'None')
    followed_links = set()  # guards against an API whose next links form a cycle

    # A loop instead of one recursive call per page: long result sets would
    # otherwise run into Python's recursion limit.
    while True:
        print(f"Fetching: {url} with params: {query}")
        try:
            response = requests.get(url, params=query, timeout=timeout)
            response.raise_for_status()  # Raise for HTTP errors (4xx or 5xx)
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers undecodable JSON on requests versions where
            # the decode error is not a RequestException.
            print(f"An error occurred: {e}")
            return all_data  # Return what was fetched so far

        # Only a JSON object can carry a 'results' wrapper; a bare list (or any
        # other JSON value) is used as-is.
        if isinstance(data, dict):
            items = data.get('results', data)  # Adjust 'results' to your API
        else:
            items = data

        if isinstance(items, list):
            all_data.extend(items)
        else:
            # Single object per page, or an unexpected structure.
            print("Warning: API response not a list. Check 'items' extraction logic.")
            all_data.append(items)

        if use_page_numbers:
            # Without a non-empty list there is no reliable end-of-data signal,
            # so stop rather than loop forever.
            if not (isinstance(items, list) and items):
                return all_data
            # int() tolerates page values supplied as strings, e.g. {'page': '1'}.
            query[page_key] = int(query.get(page_key, 1)) + 1
        else:
            next_link = data.get(next_page_link_key) if isinstance(data, dict) else None
            if not next_link or not isinstance(next_link, str):
                return all_data
            # urljoin resolves relative links; requesting the link as given
            # keeps repeated and blank query parameters intact.
            next_url = urljoin(url, next_link)
            if next_url in followed_links:
                return all_data
            followed_links.add(next_url)
            url, query = next_url, {}

        if delay and delay > 0:
            time.sleep(delay)  # Be kind to the API

# --- Example Usage ---
# Assuming an API like https://api.example.com/items?page=1 or with a 'next' link

# Example 1: API using 'page' query parameter
# total_items = fetch_paginated_data(
#     'https://jsonplaceholder.typicode.com/posts',
#     params={'_limit': 10}, # Assuming an API that responds to _page and _limit for pagination
#     page_key='_page',
#     next_page_link_key='None' # Indicate no 'next' link in response, rely on page numbers
# )
# print(f"Fetched {len(total_items)} items.")
# print(total_items[0]) # First item
# print(total_items[-1]) # Last item

# Example 2: API using a 'next' link in the response (e.g., GitHub API)
# For a real GitHub API, you'd fetch repo issues and it provides `next` in link headers
# Here's a simplified simulation using a known paginated endpoint if possible,
# or conceptual for an API that has 'next' key in JSON body
# total_users = fetch_paginated_data(
#     'https://reqres.in/api/users', # A demo API with 'page' and 'next' properties
#     params={'page': 1},
#     page_key='page',
#     next_page_link_key='next_page' # This is a conceptual key, adjust based on actual API
# )
# print(f"Fetched {len(total_users)} users.")
# if total_users:
#     print(total_users[0])

How it works: This Python snippet demonstrates how to programmatically fetch all available data from a REST API that uses pagination. It requests one page after another, either following the 'next page' link found in the JSON body of each response or incrementing a page-number query parameter until a page comes back empty. (Links delivered in HTTP headers, such as GitHub's `Link` header, are not read by this snippet and would need extra handling.) This ensures that a complete dataset is retrieved, overcoming the limitations of single-page API calls. A small `time.sleep` between requests is included to avoid hammering the API.

Fetch All Pages from a Paginated REST API

Related PYTHON Snippets

Performing Efficient Set Operations in Python

Advanced Dictionary Merging and Update Strategies in Python

Efficient Data Transformation with Python List and Generator Expressions

Need help integrating this into your project?