Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 6 additions & 22 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
name: Run Pre-recorded Tests

on:
pull_request:
branches:
- master
push:
branches:
- master

jobs:
run-tests:
runs-on: ubuntu-latest
Expand All @@ -17,40 +15,26 @@ jobs:
steps:
- name: Check out code
uses: actions/checkout@v6

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install vcrpy pytest==7.4.2 requests pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet

run: pip install -e ".[test]"
- name: Run pre-recorded tests
run: |
make test
working-directory: .

run: make test
pylint-and-black:
runs-on: ubuntu-latest
steps:
- name: Check out code
uses: actions/checkout@v6

- name: Set up Python 3.11
uses: actions/setup-python@v6
with:
python-version: "3.11"

- name: Install dependencies for imports
run: |
pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet

- name: Install pylint and black
run: |
pip install pylint black

- name: Install dependencies
run: pip install -e ".[dev,test]"
- name: Run pylint and black on ./documentcloud and ./tests
run: |
pylint ./documentcloud ./tests; black --check ./documentcloud ./tests
pylint ./documentcloud ./tests
black --check ./documentcloud ./tests
4 changes: 4 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changelog
---------

4.7.0
~~~~~
* Added burst-based sane rate limits to several endpoints.

4.6.0
~~~~~
* Added load_run_data and store_run_data on the Add-On class to access AddOn run data.
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@
# built documents.
#
# The short X.Y version.
version = "4.6"
version = "4.7"
# The full version, including alpha/beta/rc tags.
release = "4.6.0"
release = "4.7.0"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
46 changes: 44 additions & 2 deletions documentcloud/client.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
# Import SquareletClient from python-squarelet
# Standard Library
import logging
import time

# Third Party
import token_bucket
from squarelet import SquareletClient

# Local
# Local Imports
from .documents import DocumentClient
from .organizations import OrganizationClient
from .projects import ProjectClient
from .users import UserClient

logger = logging.getLogger("documentcloud")

# Per-endpoint rate limits applied on top of the global squarelet limit.
# Format: (method, url_pattern, rate_per_second, capacity)
#
# Endpoint Rate Burst Notes
# -------- ---- ----- -----
# GET documents/search 15/min 50
# POST documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min
# PUT documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min
# GET files/ 15/min 100 PDFs, full text, and other private assets
ENDPOINT_RATE_LIMITS = [
("GET", "documents/search", 15 / 60, 50),
("POST", "documents/", 12 / 60, 100),
("PUT", "documents/", 12 / 60, 100),
("GET", "files/", 15 / 60, 100),
]


class DocumentCloud(SquareletClient):
"""
Expand Down Expand Up @@ -51,8 +67,34 @@ def __init__(
else:
logger.addHandler(logging.NullHandler())

# Build per-endpoint token bucket rate limiters
storage = token_bucket.MemoryStorage()
self._endpoint_limiters = [
(
pattern_method,
pattern,
token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage),
f"{pattern_method}:{pattern}",
)
for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS
]

# Initialize the sub-clients using SquareletClient
self.documents = DocumentClient(self)
self.projects = ProjectClient(self)
self.users = UserClient(self)
self.organizations = OrganizationClient(self)

def request(self, method, url, raise_error=True, **kwargs):
for pattern_method, pattern, limiter, bucket_key in self._endpoint_limiters:
if pattern_method.upper() == method.upper() and pattern in url:
if not limiter.consume(bucket_key):
logger.warning(
"Rate limit reached for %s %s, throttling...",
method.upper(),
pattern,
)
while not limiter.consume(bucket_key):
time.sleep(0.1)
return super().request(method, url, raise_error=raise_error, **kwargs)
return super().request(method, url, raise_error=raise_error, **kwargs)
35 changes: 32 additions & 3 deletions documentcloud/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import logging
import os
import re
import time
import warnings
from functools import partial
from urllib.parse import urlparse

# Third Party
import token_bucket
from requests.exceptions import RequestException

# Local
Expand All @@ -28,6 +30,8 @@

IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"]

DEFAULT_USER_AGENT = "python-documentcloud"


class Document(BaseAPIObject):
"""A single DocumentCloud document"""
Expand Down Expand Up @@ -164,12 +168,17 @@ def _get_url(self, url, fmt=None):

if base_netloc == url_netloc:
# if the url host is the same as the base api host,
# sent the request with the client in order to include
# send the request with the client in order to include
# authentication credentials
response = self._client.get(url, full_url=True)
else:
response = requests_retry_session().get(
url, headers={"User-Agent": "python-documentcloud2"}
response = self._client.documents.asset_get(
url,
headers={
"User-Agent": self._client.session.headers.get(
"User-Agent", DEFAULT_USER_AGENT
)
},
)
if fmt == "text":
return response.content.decode("utf8")
Expand Down Expand Up @@ -246,6 +255,26 @@ class DocumentClient(BaseAPIClient):
api_path = "documents"
resource = Document

def __init__(self, client):
super().__init__(client)
# Rate limit for public document asset fetches (S3-hosted).
# Private document assets go through the API client and are limited there.
# Token bucket: burst of 100, sustained at 15/min (0.25/sec).
storage = token_bucket.MemoryStorage()
self._asset_limiter = token_bucket.Limiter(
rate=15 / 60,
capacity=100,
storage=storage,
)
self._asset_session = requests_retry_session()

def asset_get(self, url, **kwargs):
if not self._asset_limiter.consume("asset"):
logger.warning("Rate limit reached for asset fetch, throttling...")
while not self._asset_limiter.consume("asset"):
time.sleep(0.1)
return self._asset_session.get(url, **kwargs)

def search(self, query, **params):
"""Return documents matching a search query"""

Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name="python-documentcloud",
version="4.6.0",
version="4.7.0",
description="A simple Python wrapper for the DocumentCloud API",
author="Mitchell Kotler",
author_email="mitch@muckrock.com",
Expand All @@ -27,6 +27,7 @@
"pyyaml",
"fastjsonschema",
"python-squarelet",
"token-bucket",
),
extras_require={
"dev": [
Expand All @@ -40,6 +41,7 @@
"test": [
"pytest",
"pytest-mock",
"pytest-xdist",
"pytest-recording",
"vcrpy",
],
Expand Down
75 changes: 75 additions & 0 deletions tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import ratelimit

# DocumentCloud
from documentcloud import DocumentCloud
from documentcloud.constants import RATE_LIMIT
from documentcloud.exceptions import APIError, CredentialsFailedError

Expand Down Expand Up @@ -111,3 +112,77 @@ def test_expired_refresh_token(short_client, record_mode):
assert short_client.users.get("me")
# check the refresh token was updated
assert old_refresh_token != short_client.refresh_token


def test_endpoint_rate_limit_burst_exhaustion():
"""Token bucket should block after burst capacity is exhausted"""
client = DocumentCloud()
# Exhaust the search burst (capacity=50)
_pattern_method, _pattern, limiter, bucket_key = client._endpoint_limiters[0]
for _ in range(50):
limiter.consume(bucket_key)
assert not limiter.consume(bucket_key)


def test_endpoint_rate_limit_method_specificity():
"""GET and POST to documents/ should use different limiters"""
client = DocumentCloud()
limiters = {(pm, p): lim for pm, p, lim, _ in client._endpoint_limiters}
assert limiters[("GET", "files/")] is not limiters[("POST", "documents/")]


def test_endpoint_rate_limit_pattern_ordering():
"""documents/search should match before documents/"""
client = DocumentCloud()
url = "documents/search/"
matched = next(
p for pm, p, _, _ in client._endpoint_limiters if pm == "GET" and p in url
)
assert matched == "documents/search"


def test_asset_rate_limit_burst_exhaustion():
"""Asset token bucket should block after burst capacity is exhausted"""
client = DocumentCloud()
limiter = client.documents._asset_limiter
for _ in range(100):
limiter.consume("asset")
assert not limiter.consume("asset")


def test_asset_rate_limit_refills():
"""Asset token bucket should refill over time"""
client = DocumentCloud()
limiter = client.documents._asset_limiter
for _ in range(100):
limiter.consume("asset")
assert not limiter.consume("asset")
time.sleep(5)
assert limiter.consume("asset")


def test_endpoint_rate_limit_buckets_are_independent():
"""Exhausting one endpoint's bucket should not affect another"""
client = DocumentCloud()
limiters = {(pm, p): (lim, bk) for pm, p, lim, bk in client._endpoint_limiters}
search_limiter, search_key = limiters[("GET", "documents/search")]
files_limiter, files_key = limiters[("GET", "files/")]

# Exhaust search bucket
for _ in range(50):
search_limiter.consume(search_key)
assert not search_limiter.consume(search_key)

# Files bucket should still have tokens
assert files_limiter.consume(files_key)


def test_endpoint_rate_limit_no_match_for_unrecognized_url():
"""Unrecognized URLs should not match any endpoint limiter"""
client = DocumentCloud()
url = "users/me/"
matched = next(
(p for pm, p, _, _ in client._endpoint_limiters if p in url),
None,
)
assert matched is None