Skip to main content

Analyzing Nginx Access Logs with Databend

Introduced: v1.1.55

Databend can be integrated as a sink for Vector, allowing for real-time gathering and analysis of Nginx access logs. This guide offers detailed, step-by-step instructions on how to utilize Databend and Vector for this purpose. For more information about the integration with Vector, see Integrating Databend as a Sink for Vector.

Step 1. Deploy Databend

1.1 Install Databend

Follow the Docker and Local Deployments guide to deploy a local Databend.

1.2 Create a Database and a Table

CREATE DATABASE nginx;
CREATE TABLE nginx.access_logs (
`timestamp` TIMESTAMP,
`remote_addr` VARCHAR,
`remote_port` INT,
`request_method` VARCHAR,
`request_uri` VARCHAR,
`server_protocol` VARCHAR,
`status` INT,
`bytes_sent` INT,
`http_referrer` VARCHAR,
`http_user_agent` VARCHAR,
`upstream_addr` VARCHAR,
`scheme` VARCHAR,
`gzip_ratio` VARCHAR,
`request_length` INT,
`request_time` FLOAT,
`ssl_protocol` VARCHAR,
`upstream_response_time` VARCHAR
);

1.3 Create a User for Vector Auth

Create a user:

CREATE USER user1 IDENTIFIED BY 'abc123';

Grant privileges for the user:

GRANT INSERT ON nginx.* TO user1;

Step 2. Deploy Nginx

2.1 Install Nginx

If you haven't install Nginx, please refer to How to Install Nginx.

2.2 Configure Nginx

nginx.conf
user www-data;
worker_processes 4;
pid /var/run/nginx.pid;

events {
worker_connections 768;
}

http {
##
# Logging Settings
##
log_format upstream '$remote_addr "$time_local" $host "$request_method $request_uri $server_protocol" $status $bytes_sent "$http_referrer" "$http_user_agent" $remote_port $upstream_addr $scheme $gzip_ratio $request_length $request_time $ssl_protocol "$upstream_response_time"';

access_log /var/log/nginx/access.log upstream;
error_log /var/log/nginx/error.log;

include /etc/nginx/conf.d/*.conf;
include /etc/nginx/sites-enabled/*;
}

This is how the log message looks:

::1 "09/Apr/2022:11:13:39 +0800" localhost "GET /?xx HTTP/1.1" 304 189 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36" 50758 - http - 1202 0.000 - "-"

Use the new nginx.conf replace your Nginx configuration and restart the Nginx server.

Step 3. Deploy Vector

3.1 Install Vector

Your can install Vector with the installation script:

curl --proto '=https' --tlsv1.2 -sSf https://sh.vector.dev | bash

3.2 Configure Vector

vector.toml
[sources.nginx_access_log]
type = "file"
include = ["/var/log/nginx/access.log"]
file_key = "file"
max_read_bytes = 10240

[transforms.nginx_access_log_parser]
type = "remap"
inputs = ["nginx_access_log"]
drop_on_error = true

#nginx log_format upstream '$remote_addr "$time_local" $host "$request_method $request_uri $server_protocol" $status $bytes_sent "$http_referrer" "$http_user_agent" $remote_port $upstream_addr $scheme $gzip_ratio $request_length $request_time $ssl_protocol "$upstream_response_time"';

source = """
parsed_log, err = parse_regex(.message, r'^(?P<remote_addr>\\S+) \
\"(?P<time_local>\\S+ \\S+)\" \
(?P<host>\\S+) \
\"(?P<request_method>\\S+) (?P<request_uri>.+) (?P<server_protocol>HTTP/\\S+)\" \
(?P<status>\\d+) \
(?P<bytes_sent>\\d+) \
\"(?P<http_referrer>.*)\" \
\"(?P<http_user_agent>.*)\" \
(?P<remote_port>\\d+) \
(?P<upstream_addr>.+) \
(?P<scheme>\\S+) \
(?P<gzip_ratio>\\S+) \
(?P<request_length>\\d+) \
(?P<request_time>\\S+) \
(?P<ssl_protocol>\\S+) \
\"(?P<upstream_response_time>.+)\"$')
if err != null {
log("Unable to parse access log: " + string!(.message), level: "warn")
abort
}
. = merge(., parsed_log)
.timestamp = parse_timestamp!(.time_local, format: "%d/%b/%Y:%H:%M:%S %z")
.timestamp = format_timestamp!(.timestamp, format: "%F %X")

# Convert from string into integer.
.remote_port, err = to_int(.remote_port)
if err != null {
log("Unable to parse access log: " + string!(.remote_port), level: "warn")
abort
}

# Convert from string into integer.
.status, err = to_int(.status)
if err != null {
log("Unable to parse access log: " + string!(.status), level: "warn")
abort
}

# Convert from string into integer.
.bytes_sent, err = to_int(.bytes_sent)
if err != null {
log("Unable to parse access log: " + string!(.bytes_sent), level: "warn")
abort
}

# Convert from string into integer.
.request_length, err = to_int(.request_length)
if err != null {
log("Unable to parse access log: " + string!(.request_length), level: "warn")
abort
}

# Convert from string into float.
.request_time, err = to_float(.request_time)
if err != null {
log("Unable to parse access log: " + string!(.request_time), level: "warn")
abort
}
"""


[sinks.nginx_access_log_to_databend]
type = "databend"
inputs = ["nginx_access_log_parser"]
database = "nginx" #Your database
table = "access_logs" #Your table
endpoint = "http://localhost:8000/"
compression = "gzip"


[sinks.nginx_access_log_to_databend.auth]
strategy = "basic"
user = "user1" #Databend username
password = "abc123" #Databend password

[[tests]]
name = "extract fields from access log"

[[tests.inputs]]
insert_at = "nginx_access_log_parser"
type = "raw"
value = '::1 "09/Apr/2022:11:13:39 +0800" localhost "GET /?xx HTTP/1.1" 304 189 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36" 50758 - http - 1202 0.000 - "-"'

[[tests.outputs]]
extract_from = "nginx_access_log_parser"

[[tests.outputs.conditions]]
type = "vrl"
source = """
assert_eq!(.remote_addr, "::1")
assert_eq!(.time_local, "09/Apr/2022:11:13:39 +0800")
assert_eq!(.timestamp, "2022-04-09 03:13:39")
assert_eq!(.request_method, "GET")
assert_eq!(.request_uri, "/?xx")
assert_eq!(.server_protocol, "HTTP/1.1")
assert_eq!(.status, 304)
assert_eq!(.bytes_sent, 189)
assert_eq!(.http_referrer, "-")
assert_eq!(.http_user_agent, "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
assert_eq!(.remote_port, 50758)
assert_eq!(.upstream_addr, "-")
assert_eq!(.scheme, "http")
assert_eq!(.gzip_ratio, "-")
assert_eq!(.request_length, 1202)
assert_eq!(.request_time, 0.000)
assert_eq!(.ssl_protocol, "-")
assert_eq!(.upstream_response_time, "-")
"""

[[tests]]
name = "no event from wrong access log"
no_outputs_from = ["nginx_access_log_parser"]

[[tests.inputs]]
insert_at = "nginx_access_log_parser"
type = "raw"
value = 'I am not access log'

3.3 Validate Configuration

Check the nginx_access_log_parser transform works or not:

vector test ./vector.toml

If it works, the output is:

Running tests
test extract fields from access log ... passed
2022-04-09T04:03:09.704557Z WARN transform{component_kind="transform" component_id=nginx_access_log_parser component_type=remap component_name=nginx_access_log_parser}: vrl_stdlib::log: "Unable to parse access log: I am not access log" internal_log_rate_secs=1 vrl_position=479
test no event from wrong access log ... passed

3.4 Run Vector

vector -c ./vector.toml

Step 4. Analyze Nginx Log in Databend

4.1 Generate logs

Reload the home page at http://localhost/xx/yy?mm=nn many times, or using the wrk HTTP benchmarking tool to generate a large amount Nginx logs quickly:

wrk -t12 -c400 -d30s http://localhost

4.2 Analyze Nginx Access Logs in Databend

  • Top 10 Request Status
SELECT count() AS count, status FROM nginx.access_logs GROUP BY status LIMIT 10;

+-----------+--------+
| count | status |
+-----------+--------+
| 106218701 | 404 |
+-----------+--------+
  • Top 10 Request Methods
SELECT count() AS count, request_method FROM nginx.access_logs GROUP BY request_method LIMIT 10;

+-----------+----------------+
| count | request_method |
+-----------+----------------+
| 106218701 | GET |
+-----------+----------------+
  • Top 10 Request IPs
SELECT count(*) AS count, remote_addr AS client FROM nginx.access_logs GROUP BY client ORDER BY count DESC LIMIT 10;

+----------+-----------+
| count | client |
+----------+-----------+
| 98231357 | 127.0.0.1 |
| 2 | ::1 |
+----------+-----------+
  • Top 10 Request Pages
SELECT count(*) AS count, request_uri AS uri FROM nginx.access_logs GROUP BY uri ORDER BY count DESC LIMIT 10;

+----------+--------------------+
| count | uri |
+----------+--------------------+
| 60645174 | /db/abc |
| 41727953 | /a/b/c/d/e/f/d |
| 199852 | /index.html |
| 2 | /xx/yy |
+----------+--------------------+
  • Top 10 HTTP 404 Pages
SELECT count_if(status=404) AS count, request_uri AS uri FROM nginx.access_logs GROUP BY uri ORDER BY count DESC LIMIT 10;

+----------+--------------------+
| count | uri |
+----------+--------------------+
| 64290894 | /db/abc |
| 41727953 | /a/b/c/d/e/f/d |
| 199852 | /index.html |
| 2 | /xx/yy |
+----------+--------------------+
  • Top 10 Requests
SELECT count(*) AS count, request_uri AS request FROM nginx.access_logs GROUP BY request ORDER BY count DESC LIMIT 10;

+--------+-----------------------------------------------------------------------------------------------------+
| count | request |
+--------+-----------------------------------------------------------------------------------------------------+
| 199852 | /index.html HTTP/1.0 |
| 1000 | /db/abc?good=iphone&uuid=9329836906 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=17967444396 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=16399821384 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=17033481055 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=17769945743 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=15414263117 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=18945218607 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=19889051988 HTTP/1.1 |
| 900 | /miaosha/i/miaosha?goodsRandomName=0e67e331-c521-406a-b705-64e557c4c06c&mobile=15249667263 HTTP/1.1 |
+--------+-----------------------------------------------------------------------------------------------------+