Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/lint-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:

- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6
with:
go-version: "1.26.3"
go-version: "1.26.4"

- name: golangci-lint
uses: golangci/golangci-lint-action@82606bf257cbaff209d206a39f5134f0cfbfd2ee # v9
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/stress-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:

- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6
with:
go-version: "1.26.3"
go-version: "1.26.4"

- name: Run stress tests
id: stress_test
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ services:
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.persistentStateFile: /tmp/state.json
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "false"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "true"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableUptimeRobotBypass: "false"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.periodSeconds: 30
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.failureThreshold: 3
networks:
Expand All @@ -90,7 +91,7 @@ services:
--providers.docker=true
--providers.docker.network=default
--experimental.plugins.captcha-protect.modulename=github.com/libops/captcha-protect
--experimental.plugins.captcha-protect.version=v1.12.5
--experimental.plugins.captcha-protect.version=v1.13.0
volumes:
- /var/run/docker.sock:/var/run/docker.sock:z
- /CHANGEME/TO/A/HOST/PATH/FOR/STATE/FILE:/tmp/state.json:rw
Expand Down Expand Up @@ -126,6 +127,7 @@ services:
| `ipDepth` | `int` | `0` | How deep past the last non-exempt IP to fetch the real IP from `ipForwardedHeader`. Default 0 returns the last IP in the forward header |
| `goodBots` | `[]string` (encouraged) | *see below* | List of second-level domains for bots that are never challenged or rate-limited. |
| `enableGooglebotIPCheck` | `string` | `"false"` | Treat IPs coming from Googlebot's known IP ranges as good bots. |
| `enableUptimeRobotBypass` | `string` | `"false"` | When `"true"`, bypass challenges for IP ranges published by UptimeRobot. The ranges are refreshed every 24 hours. |
| `protectParameters` | `string` | `"false"` | Forces rate limiting even for good bots if URL parameters are present. Useful for protecting faceted search pages. |
| `protectFileExtensions` | `[]string` | `""` | Comma-separated file extensions to protect. By default, your protected routes only protect html files. This is to prevent files like CSS/JS/img from tripping the rate limit. |
| `protectHttpMethods` | `[]string` | `"GET,HEAD"` | Comma-separated list of HTTP methods to protect against |
Expand Down Expand Up @@ -167,11 +169,14 @@ A good default value for `goodBots` would be:

```
enableGooglebotIPCheck: "true"
enableUptimeRobotBypass: "true"
goodBots: apple.com,archive.org,duckduckgo.com,facebook.com,google.com,instagram.com,kagibot.org,linkedin.com,msn.com,openalex.org,twitter.com,x.com
```

Since Google publishes its bot IPs, we can also leverage its API to let Google crawl the site unchallenged based on client IP. This can be enabled with `enableGooglebotIPCheck: "true"`.

UptimeRobot publishes its monitoring IP ranges at `https://api.uptimerobot.com/meta/ips`. Set `enableUptimeRobotBypass: "true"` to exempt those IPs; the list is fetched at startup and refreshed every 24 hours. The default is `"false"`.

**However** if you set the config parameter `protectParameters="true"`, even good bots won't be allowed to crawl protected routes if a URL parameter is on the request (e.g. `/foo?bar=baz`). This `protectParameters` feature is meant to help protect faceted search pages.


Expand Down
2 changes: 2 additions & 0 deletions ci/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ services:
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectParameters: "${PROTECT_PARAMETERS:-false}"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: ""
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "false"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableUptimeRobotBypass: "false"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.mode: "regex"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectRoutes: "^/"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.excludeRoutes: "\\/oai\\/request,\\/node\\/\\d+\\/(book-)?manifest"
Expand Down Expand Up @@ -55,6 +56,7 @@ services:
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectParameters: "${PROTECT_PARAMETERS:-false}"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.goodBots: ""
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableGooglebotIPCheck: "false"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableUptimeRobotBypass: "false"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.mode: "regex"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.protectRoutes: "^/"
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.excludeRoutes: "\\/oai\\/request,\\/node\\/\\d+\\/(book-)?manifest"
Expand Down
22 changes: 22 additions & 0 deletions ci_behavior_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@ func TestCILabelEquivalentGooglebotParameterBehavior(t *testing.T) {
assertRedirect(t, protectedParams, googleIP, "/?foo=bar", "/challenge?destination=%2F%3Ffoo%3Dbar")
}

func TestCILabelEquivalentUptimeRobotBypassBehavior(t *testing.T) {
uptimeRobotIP := "203.0.113.10"

bypass := newCILabelEquivalentMiddleware(t, nil)
bypass.uptimeRobotIPs = helper.NewUptimeRobotIPs()
bypass.uptimeRobotIPs.Update([]string{"203.0.113.10/32"}, discardLogger())
bypass.config.EnableUptimeRobotBypass = "true"

for i := uint(0); i < ciRateLimit+1; i++ {
assertNoRedirect(t, bypass, uptimeRobotIP, "/")
}

disabled := newCILabelEquivalentMiddleware(t, nil)
disabled.uptimeRobotIPs = helper.NewUptimeRobotIPs()
disabled.uptimeRobotIPs.Update([]string{"203.0.113.10/32"}, discardLogger())
for i := uint(0); i < ciRateLimit; i++ {
assertNoRedirect(t, disabled, uptimeRobotIP, "/")
}
assertRedirect(t, disabled, uptimeRobotIP, "/", "/challenge?destination=%2F")
}

func TestPersistentStateSharingWithSynctest(t *testing.T) {
synctest.Test(t, func(t *testing.T) {
stateFile := filepath.Join(t.TempDir(), "state.json")
Expand Down Expand Up @@ -133,6 +154,7 @@ func ciLabelEquivalentConfig() *Config {
config.ProtectParameters = "false"
config.GoodBots = []string{}
config.EnableGooglebotIPCheck = "false"
config.EnableUptimeRobotBypass = "false"
config.Mode = "regex"
config.ProtectRoutes = []string{"^/"}
config.ExcludeRoutes = []string{
Expand Down
21 changes: 18 additions & 3 deletions internal/helper/google.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,19 @@ func (g *GooglebotIPs) Contains(ip net.IP) bool {
// FetchGooglebotIPs fetches the list of Googlebot IPs from Google's official endpoint,
// parses the JSON response, and returns a slice of CIDR strings.
//
// It is a convenience wrapper around FetchGooglebotIPsContext that passes
// context.Background() as the parent context, so the caller cannot cancel the
// request; use FetchGooglebotIPsContext when cancellation is required.
func FetchGooglebotIPs(log *slog.Logger, httpClient *http.Client, url string) ([]string, error) {
return FetchGooglebotIPsContext(context.Background(), log, httpClient, url)
}

// FetchGooglebotIPsContext fetches Googlebot IPs and cancels the request with ctx.
func FetchGooglebotIPsContext(parent context.Context, log *slog.Logger, httpClient *http.Client, url string) ([]string, error) {
log.Debug("Fetching Googlebot IPs")

req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return nil, fmt.Errorf("failed to create Googlebot IP request: %w", err)
}

ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
ctx, cancel := context.WithTimeout(parent, 30*time.Second)
defer cancel()
req = req.WithContext(ctx)

Expand Down Expand Up @@ -121,13 +126,18 @@ func FetchGooglebotIPs(log *slog.Logger, httpClient *http.Client, url string) ([
// FetchGoogleCrawlerIPs fetches crawler IP ranges from multiple Google-managed endpoints,
// then returns a canonical, unique list where broader prefixes replace narrower prefixes.
//
// It is a convenience wrapper around FetchGoogleCrawlerIPsContext that passes
// context.Background(), so the caller cannot cancel the fetches; use
// FetchGoogleCrawlerIPsContext when cancellation is required.
func FetchGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, urls []string) ([]string, error) {
return FetchGoogleCrawlerIPsContext(context.Background(), log, httpClient, urls)
}

// FetchGoogleCrawlerIPsContext fetches all configured Google crawler ranges with cancellation.
func FetchGoogleCrawlerIPsContext(ctx context.Context, log *slog.Logger, httpClient *http.Client, urls []string) ([]string, error) {
if len(urls) == 0 {
return nil, nil
}

allCIDRs := make([]string, 0)
for _, url := range urls {
cidrs, err := FetchGooglebotIPs(log, httpClient, url)
cidrs, err := FetchGooglebotIPsContext(ctx, log, httpClient, url)
if err != nil {
return nil, err
}
Expand All @@ -140,7 +150,12 @@ func FetchGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, urls []str
// RefreshGoogleCrawlerIPs fetches crawler IPs from all configured URLs and updates
// the provided GooglebotIPs set. Returns the number of CIDRs loaded.
func RefreshGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, target *GooglebotIPs, urls []string) (int, error) {
cidrs, err := FetchGoogleCrawlerIPs(log, httpClient, urls)
return RefreshGoogleCrawlerIPsContext(context.Background(), log, httpClient, target, urls)
}

// RefreshGoogleCrawlerIPsContext refreshes the active crawler ranges with cancellation.
func RefreshGoogleCrawlerIPsContext(ctx context.Context, log *slog.Logger, httpClient *http.Client, target *GooglebotIPs, urls []string) (int, error) {
cidrs, err := FetchGoogleCrawlerIPsContext(ctx, log, httpClient, urls)
if err != nil {
return 0, err
}
Expand Down
23 changes: 23 additions & 0 deletions internal/helper/google_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package helper

import (
"context"
"log/slog"
"net"
"net/http"
Expand Down Expand Up @@ -41,6 +42,28 @@ func TestGooglebotIPs(t *testing.T) {
t.Error("Expected 2001:db8::1 not to be a Googlebot IP")
}
}

// TestFetchGooglebotIPsContextHonorsCancellation verifies that canceling the
// context passed to FetchGooglebotIPsContext makes an in-flight fetch return
// an error instead of waiting for the server to respond.
func TestFetchGooglebotIPsContextHonorsCancellation(t *testing.T) {
// requestStarted is closed by the handler once the request has reached the server.
requestStarted := make(chan struct{})
server := httptest.NewServer(http.HandlerFunc(func(_ http.ResponseWriter, r *http.Request) {
close(requestStarted)
// Never write a response; hold the request open until its context is done.
<-r.Context().Done()
}))
defer server.Close()

ctx, cancel := context.WithCancel(context.Background())
// Buffered with capacity 1 so the goroutine's send does not block on the receiver.
done := make(chan error, 1)
go func() {
_, err := FetchGooglebotIPsContext(ctx, slog.Default(), server.Client(), server.URL)
done <- err
}()
// Cancel only after the handler has been entered, so the fetch is in flight.
<-requestStarted
cancel()

if err := <-done; err == nil {
t.Fatal("expected canceled fetch to fail")
}
}
func TestFetchGooglebotIPs(t *testing.T) {
log := slog.New(slog.NewTextHandler(os.Stdout, nil))
// Mock server
Expand Down
96 changes: 96 additions & 0 deletions internal/helper/uptimerobot.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package helper

import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net"
"net/http"
"time"
)

// maxUptimeRobotIPResponseSize is the largest response body, in bytes, that
// FetchUptimeRobotIPs accepts (1 << 20 bytes, i.e. 1 MiB); a larger body is
// rejected with an error.
const maxUptimeRobotIPResponseSize = 1 << 20

// UptimeRobotIPRangeURL is the official UptimeRobot checker range endpoint.
// NOTE(review): declared as a var rather than a const, presumably so it can be
// overridden (e.g. in tests) — confirm against callers.
var UptimeRobotIPRangeURL = "https://api.uptimerobot.com/meta/ips"

// UptimeRobotIPs is a thread-safe set of UptimeRobot IP ranges.
// It is a type alias for GooglebotIPs, so both names denote the same type and
// share the same methods.
type UptimeRobotIPs = GooglebotIPs

// NewUptimeRobotIPs creates an empty UptimeRobot IP range set.
// It returns the result of NewGooglebotIPs, since UptimeRobotIPs is an alias
// of GooglebotIPs.
func NewUptimeRobotIPs() *UptimeRobotIPs {
return NewGooglebotIPs()
}

// uptimeRobotPrefix is a single element of the "prefixes" array. Each element
// may carry an IPv4 CIDR under "ip_prefix", an IPv6 CIDR under "ipv6_prefix",
// or both; a key that is absent decodes to the empty string.
type uptimeRobotPrefix struct {
	IPv4Prefix string `json:"ip_prefix"`
	IPv6Prefix string `json:"ipv6_prefix"`
}

// uptimeRobotIPsJSON is the decoding target for the UptimeRobot IP range
// document: a top-level object whose "prefixes" key holds the range entries.
type uptimeRobotIPsJSON struct {
	Prefixes []uptimeRobotPrefix `json:"prefixes"`
}

// FetchUptimeRobotIPs issues a GET to endpoint using httpClient and ctx,
// decodes the JSON body, and returns every non-empty IPv4/IPv6 prefix it
// contains, in document order, as CIDR strings.
//
// An error is returned when the request cannot be built or sent, the status is
// not 200 OK, the body cannot be read or is larger than
// maxUptimeRobotIPResponseSize bytes, the body is not valid JSON, any non-empty
// prefix fails net.ParseCIDR, or the document yields no ranges at all. On error
// the returned slice is nil.
func FetchUptimeRobotIPs(ctx context.Context, httpClient *http.Client, endpoint string) ([]string, error) {
	request, reqErr := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if reqErr != nil {
		return nil, fmt.Errorf("failed to create UptimeRobot IP request: %w", reqErr)
	}

	response, doErr := httpClient.Do(request)
	if doErr != nil {
		return nil, fmt.Errorf("failed to fetch UptimeRobot IPs: %w", doErr)
	}
	defer response.Body.Close()

	if status := response.StatusCode; status != http.StatusOK {
		return nil, fmt.Errorf("failed to fetch UptimeRobot IPs, status code: %d", status)
	}

	// Allow one byte beyond the cap so an oversized body can be told apart
	// from one that is exactly at the limit.
	capped := io.LimitReader(response.Body, maxUptimeRobotIPResponseSize+1)
	raw, readErr := io.ReadAll(capped)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read UptimeRobot IPs: %w", readErr)
	}
	if len(raw) > maxUptimeRobotIPResponseSize {
		return nil, fmt.Errorf("UptimeRobot IP response exceeds %d bytes", maxUptimeRobotIPResponseSize)
	}

	var decoded uptimeRobotIPsJSON
	if decodeErr := json.Unmarshal(raw, &decoded); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode UptimeRobot IPs: %w", decodeErr)
	}

	ranges := make([]string, 0, len(decoded.Prefixes))
	for i := range decoded.Prefixes {
		entry := decoded.Prefixes[i]
		// Within one entry the IPv4 prefix is considered before the IPv6 one.
		for _, candidate := range [...]string{entry.IPv4Prefix, entry.IPv6Prefix} {
			if candidate != "" {
				// Reject the whole response on the first malformed range.
				_, _, parseErr := net.ParseCIDR(candidate)
				if parseErr != nil {
					return nil, fmt.Errorf("invalid UptimeRobot CIDR %q: %w", candidate, parseErr)
				}
				ranges = append(ranges, candidate)
			}
		}
	}

	if len(ranges) == 0 {
		return nil, fmt.Errorf("UptimeRobot IP response contained no ranges")
	}
	return ranges, nil
}

// RefreshUptimeRobotIPs fetches the UptimeRobot ranges from endpoint and, only
// if the fetch succeeds, installs them into target.
//
// The fetch runs under a context derived from parent with a 30-second timeout.
// On a fetch error it returns 0 and that error without calling target.Update.
// Otherwise the fetched CIDRs are passed through ReduceCIDRs, handed to
// target.Update, and the number of CIDRs remaining after reduction is returned.
func RefreshUptimeRobotIPs(parent context.Context, log *slog.Logger, httpClient *http.Client, target *UptimeRobotIPs, endpoint string) (int, error) {
	const fetchTimeout = 30 * time.Second

	fetchCtx, stop := context.WithTimeout(parent, fetchTimeout)
	defer stop()

	fetched, fetchErr := FetchUptimeRobotIPs(fetchCtx, httpClient, endpoint)
	if fetchErr != nil {
		return 0, fetchErr
	}

	reduced := ReduceCIDRs(fetched, log)
	target.Update(reduced, log)
	return len(reduced), nil
}
Loading