
ElasticSearch DSL Aggregation Queries

Aggregations are something Grafana handles easily, but every now and then a requirement comes up that calls for writing the DSL yourself, so there was nothing for it but to do it by hand.

Example 1

In the results where serverName = "dns-server-1", rank the hostip values by document count and take the top 5.

GET /my-service-2020.07.22/_search
{
  "query": {
    "term": { "serverName.keyword": "dns-server-1" }
  },
  "size" : 0,
  "aggs": {
    "top-10-hostip": {
      "terms": {
      	"field": "hostip.keyword",
        "size": 5
      }
    }
  }
}

Result

{
  "took" : 17494,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "aggregations" : {
    "top-10-hostip" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 97386291,
      "buckets" : [
        {
          "key" : "192.168.11.38",
          "doc_count" : 44581004
        },
        {
          "key" : "192.168.20.5",
          "doc_count" : 3772738
        },
        {
          "key" : "NXDOMAIN",
          "doc_count" : 1982672
        },
        {
          "key" : "103.15.99.87",
          "doc_count" : 902146
        },
        {
          "key" : "103.15.99.85",
          "doc_count" : 902014
        }
      ]
    }
  }
}

Python version:

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, A

# Connect to the cluster (three nodes, basic auth)
es = Elasticsearch(['172.29.4.158', '172.29.4.157', '172.29.4.156'], http_auth=('elastic', 'MyPassword'), timeout=60)
index = 'my-service-2020.07.22'
s = Search(using=es, index=index)
# Filter on serverName; size=0 so only aggregation results come back
s = s.filter('term', **{"serverName.keyword": "dns-server-1"}).extra(size=0)
# terms aggregation: top 5 hostip values by document count
a = A('terms', field="hostip.keyword", size=5)
s.aggs.bucket("my_aggr", a)
response = s.execute().to_dict()
print(response['aggregations']["my_aggr"]['buckets'])
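
If you already have the DSL body worked out (for example in Kibana Dev Tools), it can also be sent as-is through the low-level client, without elasticsearch_dsl at all. A minimal sketch, assuming elasticsearch-py 7.x (where search() still accepts a body parameter) and reusing the es and index objects from above:

# Hand-written DSL body, same as the GET example above
body = {
    "query": {"term": {"serverName.keyword": "dns-server-1"}},
    "size": 0,
    "aggs": {
        "top-10-hostip": {
            "terms": {"field": "hostip.keyword", "size": 5}
        }
    }
}
resp = es.search(index=index, body=body)
for bucket in resp['aggregations']['top-10-hostip']['buckets']:
    print(bucket['key'], bucket['doc_count'])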

Example 2

In the results where serverName = "dns-server-1", pull out the document count for each hour to build a report (sorted by count), and find the single hour with the highest count.

POST /my-service-2020.07.23/_search
{
  "query": {
    "bool": {
      "must": [
        { "match": { "serverName.keyword": "dns-server-1" }},
        { "match": { "type.keyword": "query[A]" }}
      ]
    }
  },
  "size": 0,
  "aggs": {
    "reply_per_hour": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "1h",
        "time_zone": "Asia/Shanghai",
        "min_doc_count": 1,
        // sort the buckets by _count here
        "order": {
          "_count": "desc"
        }
      }
    },
    "max_count_hour": {
        "max_bucket": {
          "buckets_path": "reply_per_hour._count"
        }
    }
  }
}

The result is as follows:

{
  "took" : 55364,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 10000,
      "relation" : "gte"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "reply_per_hour" : {
      "buckets" : [
        {
          "key_as_string" : "2020-09-21T18:00:00.000+08:00",
          "key" : 1600682400000,
          "doc_count" : 13347997
        },
        {
          "key_as_string" : "2020-09-21T12:00:00.000+08:00",
          "key" : 1600660800000,
          "doc_count" : 13274115
        },
        {
          "key_as_string" : "2020-09-21T17:00:00.000+08:00",
          "key" : 1600678800000,
          "doc_count" : 13236271
        },
        {
          "key_as_string" : "2020-09-21T16:00:00.000+08:00",
          "key" : 1600675200000,
          "doc_count" : 13077092
        },
        {
          "key_as_string" : "2020-09-21T15:00:00.000+08:00",
          "key" : 1600671600000,
          "doc_count" : 12918659
        },
        {
          "key_as_string" : "2020-09-21T14:00:00.000+08:00",
          "key" : 1600668000000,
          "doc_count" : 12883054
        },
        {
          "key_as_string" : "2020-09-21T11:00:00.000+08:00",
          "key" : 1600657200000,
          "doc_count" : 12872996
        },
        {
          "key_as_string" : "2020-09-21T10:00:00.000+08:00",
          "key" : 1600653600000,
          "doc_count" : 12733672
        },
        {
          "key_as_string" : "2020-09-21T13:00:00.000+08:00",
          "key" : 1600664400000,
          "doc_count" : 12173536
        },
        {
          "key_as_string" : "2020-09-21T19:00:00.000+08:00",
          "key" : 1600686000000,
          "doc_count" : 12097663
        },
        {
          "key_as_string" : "2020-09-21T20:00:00.000+08:00",
          "key" : 1600689600000,
          "doc_count" : 10507819
        },
        {
          "key_as_string" : "2020-09-21T21:00:00.000+08:00",
          "key" : 1600693200000,
          "doc_count" : 7951246
        },
        {
          "key_as_string" : "2020-09-21T09:00:00.000+08:00",
          "key" : 1600650000000,
          "doc_count" : 7725856
        },
        {
          "key_as_string" : "2020-09-21T22:00:00.000+08:00",
          "key" : 1600696800000,
          "doc_count" : 5129891
        },
        {
          "key_as_string" : "2020-09-21T23:00:00.000+08:00",
          "key" : 1600700400000,
          "doc_count" : 3346232
        },
        {
          "key_as_string" : "2020-09-22T00:00:00.000+08:00",
          "key" : 1600704000000,
          "doc_count" : 2852798
        },
        {
          "key_as_string" : "2020-09-21T08:00:00.000+08:00",
          "key" : 1600646400000,
          "doc_count" : 2373603
        },
        {
          "key_as_string" : "2020-09-22T01:00:00.000+08:00",
          "key" : 1600707600000,
          "doc_count" : 2357723
        },
        {
          "key_as_string" : "2020-09-22T02:00:00.000+08:00",
          "key" : 1600711200000,
          "doc_count" : 2194866
        },
        {
          "key_as_string" : "2020-09-22T03:00:00.000+08:00",
          "key" : 1600714800000,
          "doc_count" : 2113059
        },
        {
          "key_as_string" : "2020-09-22T07:00:00.000+08:00",
          "key" : 1600729200000,
          "doc_count" : 2081641
        },
        {
          "key_as_string" : "2020-09-22T04:00:00.000+08:00",
          "key" : 1600718400000,
          "doc_count" : 2022224
        },
        {
          "key_as_string" : "2020-09-22T06:00:00.000+08:00",
          "key" : 1600725600000,
          "doc_count" : 1996344
        },
        {
          "key_as_string" : "2020-09-22T05:00:00.000+08:00",
          "key" : 1600722000000,
          "doc_count" : 1938734
        }
      ]
    },
    "max_count_hour" : {
      "value" : 1.3347997E7,
      "keys" : [
        "2020-09-21T18:00:00.000+08:00"
      ]
    }
  }
}

Note: max_count_hour.value is displayed as 1.3347997E7 because max_bucket is a pipeline aggregation and returns its value as a double, which Elasticsearch prints in scientific notation at this magnitude; the value is simply 13,347,997.
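
If you post-process the response in Python, a plain int() cast is enough to get the number back. A minimal sketch, assuming the JSON above has been parsed into a dict called resp:

max_hour = resp['aggregations']['max_count_hour']
print(max_hour['keys'][0], int(max_hour['value']))  # 2020-09-21T18:00:00.000+08:00 13347997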

Python version

A few more query conditions are added here (serverName = "dns-server-1", hostip.keyword = "192.168.11.12" and type.keyword = "reply").

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, A

# Connect to the cluster (three nodes, basic auth)
es = Elasticsearch(['172.29.4.158', '172.29.4.157', '172.29.4.156'], http_auth=('elastic', 'MyPassword'), timeout=60)
index = 'my-service-2020.07.22'
s = Search(using=es, index=index)
# Extra filter conditions on serverName, hostip and type; size=0 so only aggregations come back
s = s.filter('term', **{"serverName.keyword": "dns-server-1"}) \
     .filter('term', **{"hostip.keyword": "192.168.11.12"}) \
     .filter('match', type__keyword="reply") \
     .extra(size=0)
# Hourly date_histogram (calendar_interval only accepts calendar units such as 1h/1d);
# min_doc_count=100 drops buckets with fewer than 100 documents
a = A('date_histogram', field="@timestamp", time_zone="Asia/Shanghai", min_doc_count=100, calendar_interval="1h")
s.aggs.bucket("my_aggr", a)
response = s.execute().to_dict()
print(response['aggregations']["my_aggr"]['buckets'])
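
The DSL version of example 2 also computes max_count_hour with a max_bucket pipeline aggregation; elasticsearch_dsl can express that through aggs.pipeline(). A minimal sketch that reproduces the hourly report plus max_count_hour under the same connection settings (s2 and response2 are just illustrative names):

# Fresh Search object: hourly buckets ordered by count, plus the max_bucket pipeline
s2 = Search(using=es, index=index) \
    .filter('term', **{"serverName.keyword": "dns-server-1"}) \
    .extra(size=0)
s2.aggs.bucket("reply_per_hour", "date_histogram", field="@timestamp",
               time_zone="Asia/Shanghai", calendar_interval="1h",
               min_doc_count=1, order={"_count": "desc"})
# Pipeline aggregation that picks the hour with the highest document count
s2.aggs.pipeline("max_count_hour", "max_bucket", buckets_path="reply_per_hour._count")
response2 = s2.execute().to_dict()
print(response2['aggregations']['max_count_hour'])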

References:
Elasticsearch DSL
Max Bucket Aggregation
elasticsearch-dsl で集計を伴う検索クエリを書く (Writing search queries with aggregations using elasticsearch-dsl)
