diff --git a/README.md b/README.md index 19018b7..eb6d13b 100644 --- a/README.md +++ b/README.md @@ -17,18 +17,16 @@ Available Commands: snapshot Checks the status of Elasticsearch snapshots Flags: - -H, --hostname string Hostname of the Elasticsearch instance (CHECK_ELASTICSEARCH_HOSTNAME) (default "localhost") - -p, --port int Port of the Elasticsearch instance (default 9200) - -U, --username string Username for HTTP Basic Authentication (CHECK_ELASTICSEARCH_USERNAME) - -P, --password string Password for HTTP Basic Authentication (CHECK_ELASTICSEARCH_PASSWORD) - -S, --tls Use a HTTPS connection - --insecure Skip the verification of the server's TLS certificate - --ca-file string Specify the CA File for TLS authentication (CHECK_ELASTICSEARCH_CA_FILE) - --cert-file string Specify the Certificate File for TLS authentication (CHECK_ELASTICSEARCH_CERT_FILE) - --key-file string Specify the Key File for TLS authentication (CHECK_ELASTICSEARCH_KEY_FILE) - -t, --timeout int Timeout in seconds for the CheckPlugin (default 30) - -h, --help help for check_elasticsearch - -v, --version version for check_elasticsearch + -H, --hostname stringArray URL of an Elasticsearch instance. Can be used multiple times. (default [http://localhost:9200]) + -U, --username string Username for HTTP Basic Authentication (CHECK_ELASTICSEARCH_USERNAME) + -P, --password string Password for HTTP Basic Authentication (CHECK_ELASTICSEARCH_PASSWORD) + --insecure Skip the verification of the server's TLS certificate + --ca-file string Specify the CA File for TLS authentication (CHECK_ELASTICSEARCH_CA_FILE) + --cert-file string Specify the Certificate File for TLS authentication (CHECK_ELASTICSEARCH_CERT_FILE) + --key-file string Specify the Key File for TLS authentication (CHECK_ELASTICSEARCH_KEY_FILE) + -t, --timeout int Timeout in seconds for the plugin (default 30) + -h, --help help for check_elasticsearch + -v, --version version for check_elasticsearch ``` The check plugin respects the environment variables `HTTP_PROXY`, `HTTPS_PROXY` and `NO_PROXY`. @@ -54,14 +52,22 @@ Examples: Elasticsearch cluster with green status (all nodes are running): ``` -$ check_elasticsearch health -U exampleuser -P examplepassword -S --insecure +$ check_elasticsearch health -U exampleuser -P examplepassword +[OK] - Cluster es-example-cluster is green | status=0 nodes=3 data_nodes=3 active_primary_shards=10 active_shards=20 +``` + +When you have multiple cluster nodes: + +``` +$ check_elasticsearch health -U exampleuser -P examplepassword \ +--hostname "https://node1:9200" --hostname "https://node2:9200" --hostname "https://node3:9200" [OK] - Cluster es-example-cluster is green | status=0 nodes=3 data_nodes=3 active_primary_shards=10 active_shards=20 ``` Elasticsearch cluster with yellow status (not all nodes are running): ``` -$ check_elasticsearch health -U exampleuser -P examplepassword -S --insecure +$ check_elasticsearch health -U exampleuser -P examplepassword [WARNING] - Cluster es-example-cluster is yellow | status=1 nodes=2 data_nodes=2 active_primary_shards=10 active_shards=13``` ``` diff --git a/cmd/config.go b/cmd/config.go index a6f143e..5491e54 100644 --- a/cmd/config.go +++ b/cmd/config.go @@ -7,7 +7,6 @@ import ( "net/url" "os" "reflect" - "strconv" "time" "github.com/NETWAYS/check_elasticsearch/internal/client" @@ -16,15 +15,13 @@ import ( ) type Config struct { + Hostname []string Bearer string // Currently unused in CLI - Hostname string `env:"CHECK_ELASTICSEARCH_HOSTNAME"` CAFile string `env:"CHECK_ELASTICSEARCH_CA_FILE"` CertFile string `env:"CHECK_ELASTICSEARCH_CERT_FILE"` KeyFile string `env:"CHECK_ELASTICSEARCH_KEY_FILE"` Username string `env:"CHECK_ELASTICSEARCH_USERNAME"` Password string `env:"CHECK_ELASTICSEARCH_PASSWORD"` - Port int - TLS bool Insecure bool } @@ -65,13 +62,16 @@ func loadFromEnv(config any) { var cliConfig Config func (c *Config) NewClient() *client.Client { - u := url.URL{ - Scheme: "http", - Host: c.Hostname + ":" + strconv.Itoa(c.Port), - } + urls := make([]*url.URL, 0, len(c.Hostname)) + + for _, host := range c.Hostname { + u, errParse := url.Parse(host) + + if errParse != nil { + check.ExitError(errParse) + } - if c.TLS { - u.Scheme = "https" + urls = append(urls, u) } // Create TLS configuration for default RoundTripper @@ -110,5 +110,5 @@ func (c *Config) NewClient() *client.Client { rt = checkhttpconfig.NewBasicAuthRoundTripper(c.Username, c.Password, rt) } - return client.NewClient(u.String(), rt) + return client.NewClient(urls, rt) } diff --git a/cmd/health.go b/cmd/health.go index 0972084..0063e33 100644 --- a/cmd/health.go +++ b/cmd/health.go @@ -15,8 +15,8 @@ The cluster health status is: green = OK yellow = WARNING red = CRITICAL`, - Example: " check_elasticsearch health --hostname \"127.0.0.1\" --port 9200 --username \"exampleUser\" " + - "--password \"examplePass\" --tls --insecure", + Example: " check_elasticsearch health --hostname \"https://localhost:9200\" --username \"exampleUser\" " + + "--password \"examplePass\" --insecure", Run: func(_ *cobra.Command, _ []string) { client := cliConfig.NewClient() @@ -26,6 +26,11 @@ The cluster health status is: } var rc int + // How we map cluster states: + // green = 0 + // yellow = 1 + // red = 2 + // unknown = 3 switch health.Status { case "green": rc = check.OK @@ -42,12 +47,7 @@ The cluster health status is: output = "Cluster " + health.ClusterName + " is " + health.Status } - // green = 0 - // yellow = 1 - // red = 2 - // unknown = 3 p := perfdata.PerfdataList{ - {Label: "status", Value: rc}, {Label: "nodes", Value: health.NumberOfNodes}, {Label: "data_nodes", Value: health.NumberOfDataNodes}, {Label: "active_primary_shards", Value: health.ActivePrimaryShards}, diff --git a/cmd/health_test.go b/cmd/health_test.go index 5214311..8f6c63d 100644 --- a/cmd/health_test.go +++ b/cmd/health_test.go @@ -3,7 +3,6 @@ package cmd import ( "net/http" "net/http/httptest" - "net/url" "os/exec" "strings" "testing" @@ -11,11 +10,11 @@ import ( func TestHealth_ConnectionRefused(t *testing.T) { - cmd := exec.Command("go", "run", "../main.go", "health", "--port", "9999") + cmd := exec.Command("go", "run", "../main.go", "health", "--hostname", "http://localhost:9999") out, _ := cmd.CombinedOutput() actual := string(out) - expected := "[UNKNOWN] - could not fetch cluster health: Get \"http://localhost:9999/_cluster/health\": dial" + expected := "[UNKNOWN] - could not fetch cluster health: no node reachable (*errors.errorString)" if !strings.Contains(actual, expected) { t.Error("\nActual: ", actual, "\nExpected: ", expected) @@ -48,7 +47,7 @@ func TestHealthCmd(t *testing.T) { w.Write([]byte(`The Authorization header wasn't set`)) })), args: []string{"run", "../main.go", "health", "--username", "username", "--password", "password"}, - expected: "[OK] - Cluster test is green | status=0 nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\n", + expected: "[OK] - Cluster test is green | nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\n", }, { name: "health-invalid", @@ -58,7 +57,7 @@ func TestHealthCmd(t *testing.T) { w.Write([]byte(`{}`)) })), args: []string{"run", "../main.go", "health"}, - expected: "[UNKNOWN] - Cluster status unknown | status=3 nodes=0 data_nodes=0 active_primary_shards=0 active_shards=0\nexit status 3\n", + expected: "[UNKNOWN] - Cluster status unknown | nodes=0 data_nodes=0 active_primary_shards=0 active_shards=0\nexit status 3\n", }, { name: "health-404", @@ -78,7 +77,7 @@ func TestHealthCmd(t *testing.T) { w.Write([]byte(`{"cluster_name":"test","status":"foobar","timed_out":false,"number_of_nodes":1,"number_of_data_nodes":1,"active_primary_shards":3}`)) })), args: []string{"run", "../main.go", "health"}, - expected: "[UNKNOWN] - Cluster test is foobar | status=3 nodes=1 data_nodes=1 active_primary_shards=3 active_shards=0\nexit status 3\n", + expected: "[UNKNOWN] - Cluster test is foobar | nodes=1 data_nodes=1 active_primary_shards=3 active_shards=0\nexit status 3\n", }, { name: "health-ok", @@ -88,7 +87,7 @@ func TestHealthCmd(t *testing.T) { w.Write([]byte(`{"cluster_name":"test","status":"green","timed_out":false,"number_of_nodes":1,"number_of_data_nodes":1,"active_primary_shards":3,"active_shards":3,"relocating_shards":0,"initializing_shards":0,"unassigned_shards":0,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":100.0}`)) })), args: []string{"run", "../main.go", "health"}, - expected: "[OK] - Cluster test is green | status=0 nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\n", + expected: "[OK] - Cluster test is green | nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\n", }, { name: "health-yellow", @@ -98,7 +97,7 @@ func TestHealthCmd(t *testing.T) { w.Write([]byte(`{"cluster_name":"test","status":"yellow","timed_out":false,"number_of_nodes":1,"number_of_data_nodes":1,"active_primary_shards":3,"active_shards":3,"relocating_shards":0,"initializing_shards":0,"unassigned_shards":0,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":100.0}`)) })), args: []string{"run", "../main.go", "health"}, - expected: "[WARNING] - Cluster test is yellow | status=1 nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\nexit status 1\n", + expected: "[WARNING] - Cluster test is yellow | nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\nexit status 1\n", }, { name: "health-red", @@ -108,7 +107,7 @@ func TestHealthCmd(t *testing.T) { w.Write([]byte(`{"cluster_name":"test","status":"red","timed_out":false,"number_of_nodes":1,"number_of_data_nodes":1,"active_primary_shards":3,"active_shards":3,"relocating_shards":0,"initializing_shards":0,"unassigned_shards":0,"delayed_unassigned_shards":0,"number_of_pending_tasks":0,"number_of_in_flight_fetch":0,"task_max_waiting_in_queue_millis":0,"active_shards_percent_as_number":100.0}`)) })), args: []string{"run", "../main.go", "health"}, - expected: "[CRITICAL] - Cluster test is red | status=2 nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\nexit status 2\n", + expected: "[CRITICAL] - Cluster test is red | nodes=1 data_nodes=1 active_primary_shards=3 active_shards=3\nexit status 2\n", }, } @@ -116,9 +115,7 @@ func TestHealthCmd(t *testing.T) { t.Run(test.name, func(t *testing.T) { defer test.server.Close() - // We need the random Port extracted - u, _ := url.Parse(test.server.URL) - cmd := exec.Command("go", append(test.args, "--port", u.Port())...) + cmd := exec.Command("go", append(test.args, "--hostname", test.server.URL)...) out, _ := cmd.CombinedOutput() actual := string(out) diff --git a/cmd/ingest_test.go b/cmd/ingest_test.go index 53c266c..31d7055 100644 --- a/cmd/ingest_test.go +++ b/cmd/ingest_test.go @@ -3,7 +3,6 @@ package cmd import ( "net/http" "net/http/httptest" - "net/url" "os/exec" "strings" "testing" @@ -11,11 +10,11 @@ import ( func TestIngest_ConnectionRefused(t *testing.T) { - cmd := exec.Command("go", "run", "../main.go", "ingest", "--port", "9999") + cmd := exec.Command("go", "run", "../main.go", "ingest", "--hostname", "http://localhost:9999") out, _ := cmd.CombinedOutput() actual := string(out) - expected := "[UNKNOWN] - could not fetch cluster nodes statistics: Get \"http://localhost:9999/_nodes/stats\": dial" + expected := "[UNKNOWN] - could not fetch cluster nodes statistics: no node reachable (*errors.errorString)" if !strings.Contains(actual, expected) { t.Error("\nActual: ", actual, "\nExpected: ", expected) @@ -82,9 +81,7 @@ func TestIngestCmd(t *testing.T) { t.Run(test.name, func(t *testing.T) { defer test.server.Close() - // We need the random Port extracted - u, _ := url.Parse(test.server.URL) - cmd := exec.Command("go", append(test.args, "--port", u.Port())...) + cmd := exec.Command("go", append(test.args, "--hostname", test.server.URL)...) out, _ := cmd.CombinedOutput() actual := string(out) diff --git a/cmd/query_test.go b/cmd/query_test.go index 8d6cf3a..8097900 100644 --- a/cmd/query_test.go +++ b/cmd/query_test.go @@ -3,7 +3,6 @@ package cmd import ( "net/http" "net/http/httptest" - "net/url" "os/exec" "strings" "testing" @@ -11,12 +10,12 @@ import ( func TestQuery_ConnectionRefused(t *testing.T) { - cmd := exec.Command("go", "run", "../main.go", "query", "--port", "9999") + cmd := exec.Command("go", "run", "../main.go", "query", "--hostname", "http://localhost:9999") out, _ := cmd.CombinedOutput() actual := string(out) - expected := "[UNKNOWN] - could not execute search request: Get \"http://localhost:9999/_all/_search?size=1&track_total_hits=true\": dial" + expected := "[UNKNOWN] - could not execute search request: no node reachable (*errors.errorString)" if !strings.Contains(actual, expected) { t.Error("\nActual: ", actual, "\nExpected: ", expected) @@ -137,9 +136,7 @@ exit status 1 t.Run(test.name, func(t *testing.T) { defer test.server.Close() - // We need the random Port extracted - u, _ := url.Parse(test.server.URL) - cmd := exec.Command("go", append(test.args, "--port", u.Port())...) + cmd := exec.Command("go", append(test.args, "--hostname", test.server.URL)...) out, _ := cmd.CombinedOutput() actual := string(out) diff --git a/cmd/root.go b/cmd/root.go index accd988..9f94033 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -41,16 +41,12 @@ func init() { }) pfs := rootCmd.PersistentFlags() - pfs.StringVarP(&cliConfig.Hostname, "hostname", "H", "localhost", - "Hostname of the Elasticsearch instance (CHECK_ELASTICSEARCH_HOSTNAME)") - pfs.IntVarP(&cliConfig.Port, "port", "p", 9200, - "Port of the Elasticsearch instance") + pfs.StringArrayVarP(&cliConfig.Hostname, "hostname", "H", []string{"http://localhost:9200"}, + "URL of an Elasticsearch instance. Can be used multiple times.") pfs.StringVarP(&cliConfig.Username, "username", "U", "", "Username for HTTP Basic Authentication (CHECK_ELASTICSEARCH_USERNAME)") pfs.StringVarP(&cliConfig.Password, "password", "P", "", "Password for HTTP Basic Authentication (CHECK_ELASTICSEARCH_PASSWORD)") - pfs.BoolVarP(&cliConfig.TLS, "tls", "S", false, - "Use a HTTPS connection") pfs.BoolVar(&cliConfig.Insecure, "insecure", false, "Skip the verification of the server's TLS certificate") pfs.StringVarP(&cliConfig.CAFile, "ca-file", "", "", @@ -60,7 +56,7 @@ func init() { pfs.StringVarP(&cliConfig.KeyFile, "key-file", "", "", "Specify the Key File for TLS authentication (CHECK_ELASTICSEARCH_KEY_FILE)") pfs.IntVarP(&timeout, "timeout", "t", timeout, - "Timeout in seconds for the CheckPlugin") + "Timeout in seconds for the plugin") rootCmd.Flags().SortFlags = false pfs.SortFlags = false diff --git a/cmd/snapshot_test.go b/cmd/snapshot_test.go index 9ac9ad1..e085595 100644 --- a/cmd/snapshot_test.go +++ b/cmd/snapshot_test.go @@ -3,7 +3,6 @@ package cmd import ( "net/http" "net/http/httptest" - "net/url" "os/exec" "strings" "testing" @@ -11,11 +10,11 @@ import ( func TestSnapshot_ConnectionRefused(t *testing.T) { - cmd := exec.Command("go", "run", "../main.go", "snapshot", "--port", "9999") + cmd := exec.Command("go", "run", "../main.go", "snapshot", "--hostname", "http://localhost:9999") out, _ := cmd.CombinedOutput() actual := string(out) - expected := "[UNKNOWN] - could not fetch snapshots: Get \"http://localhost:9999/_snapshot/*/*?order=desc\": dial" + expected := "[UNKNOWN] - could not fetch snapshots: no node reachable (*errors.errorString)" if !strings.Contains(actual, expected) { t.Error("\nActual: ", actual, "\nExpected: ", expected) @@ -154,9 +153,7 @@ func TestSnapshotCmd(t *testing.T) { t.Run(test.name, func(t *testing.T) { defer test.server.Close() - // We need the random Port extracted - u, _ := url.Parse(test.server.URL) - cmd := exec.Command("go", append(test.args, "--port", u.Port())...) + cmd := exec.Command("go", append(test.args, "--hostname", test.server.URL)...) out, _ := cmd.CombinedOutput() actual := string(out) diff --git a/internal/client/client.go b/internal/client/client.go index 2d5be08..4df4f3c 100644 --- a/internal/client/client.go +++ b/internal/client/client.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "net/http" "net/url" @@ -14,25 +15,50 @@ import ( type Client struct { Client http.Client - URL string + URLs []*url.URL } -func NewClient(url string, rt http.RoundTripper) *Client { +func NewClient(urls []*url.URL, rt http.RoundTripper) *Client { c := &http.Client{ Transport: rt, } return &Client{ - URL: url, + URLs: urls, Client: *c, } } +// Perform wraps the Client's HTTP call so that we can try all given +// nodes in case one node is not reachable +func (c *Client) Perform(req *http.Request) (*http.Response, error) { + originalPath := req.URL.String() + + for _, hostURL := range c.URLs { + // For each URL take the request, prepend the URL + u, _ := url.JoinPath(hostURL.String(), originalPath) + + req.URL, _ = url.Parse(u) + + resp, errDo := c.Client.Do(req) + + if errDo != nil { + // If there's an error we try the next host + continue + } + + return resp, errDo + } + + return &http.Response{}, errors.New("no node reachable") +} + +// Health retrieves the Cluster's health state func (c *Client) Health() (*es.HealthResponse, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - u, _ := url.JoinPath(c.URL, "/_cluster/health") + u := "/_cluster/health" req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) @@ -42,10 +68,10 @@ func (c *Client) Health() (*es.HealthResponse, error) { return r, fmt.Errorf("error creating request: %w", err) } - resp, err := c.Client.Do(req) + resp, err := c.Perform(req) if err != nil { - return r, fmt.Errorf("could not fetch cluster health: %w", err) + return r, fmt.Errorf("could not fetch cluster health: %s", err.Error()) } if resp.StatusCode != http.StatusOK { @@ -85,7 +111,7 @@ func (c *Client) SearchMessages(index string, query string, messageKey string) ( return total, messages, fmt.Errorf("error encoding query: %w", err) } - u, _ := url.JoinPath(c.URL, index, "/_search") + u := index + "/_search" ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -104,10 +130,10 @@ func (c *Client) SearchMessages(index string, query string, messageKey string) ( req.URL.RawQuery = p.Encode() - resp, err := c.Client.Do(req) + resp, err := c.Perform(req) if err != nil { - return total, messages, fmt.Errorf("could not execute search request: %w", err) + return total, messages, fmt.Errorf("could not execute search request: %s", err.Error()) } var response es.SearchResponse @@ -144,11 +170,12 @@ func (c *Client) SearchMessages(index string, query string, messageKey string) ( return total, messages, nil } +// NodeStates retrieves the Cluster's node state func (c *Client) NodeStats() (*es.ClusterStats, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - u, _ := url.JoinPath(c.URL, "/_nodes/stats") + u := "/_nodes/stats" req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) @@ -158,10 +185,10 @@ func (c *Client) NodeStats() (*es.ClusterStats, error) { return r, fmt.Errorf("error creating request: %w", err) } - resp, err := c.Client.Do(req) + resp, err := c.Perform(req) if err != nil { - return r, fmt.Errorf("could not fetch cluster nodes statistics: %w", err) + return r, fmt.Errorf("could not fetch cluster nodes statistics: %s", err.Error()) } if resp.StatusCode != http.StatusOK { @@ -178,13 +205,14 @@ func (c *Client) NodeStats() (*es.ClusterStats, error) { return r, nil } +// Snapshot retrieves the cluster's snapshot states func (c *Client) Snapshot(repository string, snapshot string) (*es.SnapshotResponse, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() r := &es.SnapshotResponse{} - u, _ := url.JoinPath(c.URL, "/_snapshot/", repository, snapshot) + u, _ := url.JoinPath("/_snapshot/", repository, snapshot) // Retrieve snapshots in descending order to get latest req, err := http.NewRequestWithContext(ctx, http.MethodGet, u+"?order=desc", nil) @@ -193,10 +221,10 @@ func (c *Client) Snapshot(repository string, snapshot string) (*es.SnapshotRespo return r, fmt.Errorf("error creating request: %w", err) } - resp, err := c.Client.Do(req) + resp, err := c.Perform(req) if err != nil { - return r, fmt.Errorf("could not fetch snapshots: %w", err) + return r, fmt.Errorf("could not fetch snapshots: %s", err.Error()) } if resp.StatusCode != http.StatusOK { diff --git a/internal/elasticsearch/api.go b/internal/elasticsearch/api.go index 042336e..2ef2b84 100644 --- a/internal/elasticsearch/api.go +++ b/internal/elasticsearch/api.go @@ -36,6 +36,7 @@ type ErrorRootCause struct { Reason string `json:"reason"` } +// GetErrors returns the error reasons when they are present in the response func (r *SearchResponse) GetErrors() string { if len(r.Error.RootCause) == 0 { return ""