search (13) - elastic4s histograms: aggregate histogram

In aggregate group statistics, we will face two types of group elements: continuous type such as time, natural number, discrete type such as location, product, etc. Discrete data itself represents different groups, but continuous data needs to be manually segmented at equal intervals. Here is an example of aggregation by price segment:

POST /cartxns/_search
{
  "size" : 1,
  "aggs": {
    "sales_per_pricerange": {
      "histogram": {
        "field": "price",
        "interval": 20000
      },
      "aggs": {
        "total sales": {
          "sum": {
            "field": "price"
          }
        }
      }
    }
  }
 }
}

In the example above, we split the price by 20000. Get the measure of the price segment 0-1999920000-3999940000-59999

  "aggregations" : {
    "sales_per_pricerange" : {
      "buckets" : [
        {
          "key" : 0.0,
          "doc_count" : 3,
          "total sales" : {
            "value" : 37000.0
          }
        },
        {
          "key" : 20000.0,
          "doc_count" : 4,
          "total sales" : {
            "value" : 95000.0
          }
        },
        {
          "key" : 40000.0,
          "doc_count" : 0,
          "total sales" : {
            "value" : 0.0
          }
        },
        {
          "key" : 60000.0,
          "doc_count" : 0,
          "total sales" : {
            "value" : 0.0
          }
        },
        {
          "key" : 80000.0,
          "doc_count" : 1,
          "total sales" : {
            "value" : 80000.0
          }
        }
      ]
    }
  }

In elastic4s, it is expressed as follows:

  val aggHist = search("cartxns").aggregations(
    histogramAggregation("sales_per_price")
      .field("price")
      .interval(20000).subAggregations(
      sumAggregation("total_sales").field("price")
    )
  )
  println(aggHist.show)

  val histResult = client.execute(aggHist).await

  if (histResult.isSuccess)
    histResult.result.aggregations.histogram("sales_per_price").buckets
        .foreach(hb => println(s"${hb.key},${hb.docCount}:${hb.sum("total_sales").value}"))
  else println(s"error: ${histResult.error.reason}")

....

POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_price":{"histogram":{"interval":20000.0,"field":"price"},"aggs":{"total_sales":{"sum":{"field":"price"}}}}}},Some(application/json))
0.0,3:37000.0
20000.0,4:95000.0
40000.0,0:0.0
60000.0,0:0.0
80000.0,1:80000.0

The following statistics by car type are the aggregation statistics of a discrete element:

POST /cartxns/_search
{
  "size" : 1,
  "aggs": {
    "avage price per model" : {
        "terms": {"field" : "make.keyword"},
        "aggs": {
          "average price": {
            "avg": {"field": "price"}
          },
          "max price" : {
            "max": {
              "field": "price"
            }
          },
          "min price" : {
            "min": {
              "field": "price"
            }
          }
          
        }
     }
  }
}

We can get the average price and the lowest and highest price of each car:

  "aggregations" : {
    "avage price per model" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "honda",
          "doc_count" : 3,
          "max price" : {
            "value" : 20000.0
          },
          "average price" : {
            "value" : 16666.666666666668
          },
          "min price" : {
            "value" : 10000.0
          }
        },
        {
          "key" : "ford",
          "doc_count" : 2,
          "max price" : {
            "value" : 30000.0
          },
          "average price" : {
            "value" : 27500.0
          },
          "min price" : {
            "value" : 25000.0
          }
        },
        {
          "key" : "toyota",
          "doc_count" : 2,
          "max price" : {
            "value" : 15000.0
          },
          "average price" : {
            "value" : 13500.0
          },
          "min price" : {
            "value" : 12000.0
          }
        },
        {
          "key" : "bmw",
          "doc_count" : 1,
          "max price" : {
            "value" : 80000.0
          },
          "average price" : {
            "value" : 80000.0
          },
          "min price" : {
            "value" : 80000.0
          }
        }
      ]
    }
  }

The demonstration of elastic4s is as follows:

  val aggDisc = search("cartxns").aggregations(
    termsAgg("prices_per_model","make.keyword").subAggregations(
      avgAgg("average_price","price"),
      minAgg("min_price","price"),
      maxAgg("max_price","price")
    )
  )
  println(aggDisc.show)
  val discResult = client.execute(aggDisc).await

  if (discResult.isSuccess)
    discResult.result.aggregations.terms("prices_per_model").buckets
      .foreach(mb =>
        println(s"${mb.key},${mb.docCount}:${mb.avg("average_price").value}," +
          s"${mb.min("min_price").value.getOrElse(0)}," +
          s"${mb.max("max_price").value.getOrElse(0)}"))
  else println(s"error: ${discResult.error.causedBy.getOrElse("unknown")}")

...

POST:/cartxns/_search?
StringEntity({"aggs":{"prices_per_model":{"terms":{"field":"make.keyword"},"aggs":{"average_price":{"avg":{"field":"price"}},"min_price":{"min":{"field":"price"}},"max_price":{"max":{"field":"price"}}}}}},Some(application/json))
honda,3:16666.666666666668,10000.0,20000.0
ford,2:27500.0,25000.0,30000.0
toyota,2:13500.0,12000.0,15000.0
bmw,1:80000.0,80000.0,80000.0

Date? Histogram is a statistical method of aggregation by time interval. It is very useful for data analysis based on time trend:

POST /cartxns/_search
{
   "aggs": {
     "sales_per_month": {
       "date_histogram": {
         "field": "sold",
         "calendar_interval":"1M",
         "format": "yyyy-MM-dd"
       }
     }
   }
}

...

  "aggregations" : {
    "sales_per_month" : {
      "buckets" : [
        {
          "key_as_string" : "2014-01-01",
          "key" : 1388534400000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-02-01",
          "key" : 1391212800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-03-01",
          "key" : 1393632000000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-04-01",
          "key" : 1396310400000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-05-01",
          "key" : 1398902400000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-06-01",
          "key" : 1401580800000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-07-01",
          "key" : 1404172800000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-08-01",
          "key" : 1406851200000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-09-01",
          "key" : 1409529600000,
          "doc_count" : 0
        },
        {
          "key_as_string" : "2014-10-01",
          "key" : 1412121600000,
          "doc_count" : 1
        },
        {
          "key_as_string" : "2014-11-01",
          "key" : 1414800000000,
          "doc_count" : 2
        }
      ]
    }
  }

The above example generates a bucket with month as its unit. elastic4s demonstration:

  val aggDateHist = search("cartxns").aggregations(
    dateHistogramAggregation("sales_per_month")
      .field("sold")
      .calendarInterval(DateHistogramInterval.Month)
      .format("yyyy-MM-dd")
      .minDocCount(1)
  )
  println(aggDateHist.show)

  val dtHistResult = client.execute(aggDateHist).await

  if (dtHistResult.isSuccess)
    dtHistResult.result.aggregations.dateHistogram("sales_per_month").buckets
        .foreach(db => println(s"${db.date},${db.docCount}"))
  else println(s"error: ${dtHistResult.error.causedBy.getOrElse("unknown")}")

...

POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"}}}},Some(application/json))
2014-01-01,1
2014-02-01,1
2014-05-01,1
2014-07-01,1
2014-08-01,1
2014-10-01,1
2014-11-01,2

After bucket is divided by month, deep aggregation can be performed every month:

POST /cartxns/_search
{
   "aggs": {
     "sales_per_month": {
       "date_histogram": {
         "field": "sold",
         "calendar_interval":"1M",
         "format": "yyyy-MM-dd"
       },
       "aggs": {
         "per_make_sum": {
           "terms": {
             "field": "make.keyword",
             "size": 10
           },
           "aggs": {
             "sum_price": {
               "sum": {"field": "price"}
             }
           }
         },
         "total_sum": {
           "sum": {
             "field": "price"
           }
         }
       }
     }
   }
}

We can get the total sales amount of each month and the sales amount of each car model, as follows:

"aggregations" : {
    "sales_per_month" : {
      "buckets" : [
        {
          "key_as_string" : "2014-01-01",
          "key" : 1388534400000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "bmw",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 80000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 80000.0
          }
        },
        {
          "key_as_string" : "2014-02-01",
          "key" : 1391212800000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "ford",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 25000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 25000.0
          }
        },
        {
          "key_as_string" : "2014-03-01",
          "key" : 1393632000000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-04-01",
          "key" : 1396310400000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-05-01",
          "key" : 1398902400000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "ford",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 30000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 30000.0
          }
        },
        {
          "key_as_string" : "2014-06-01",
          "key" : 1401580800000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-07-01",
          "key" : 1404172800000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "toyota",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 15000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 15000.0
          }
        },
        {
          "key_as_string" : "2014-08-01",
          "key" : 1406851200000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "toyota",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 12000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 12000.0
          }
        },
        {
          "key_as_string" : "2014-09-01",
          "key" : 1409529600000,
          "doc_count" : 0,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [ ]
          },
          "total_sum" : {
            "value" : 0.0
          }
        },
        {
          "key_as_string" : "2014-10-01",
          "key" : 1412121600000,
          "doc_count" : 1,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "honda",
                "doc_count" : 1,
                "sum_price" : {
                  "value" : 10000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 10000.0
          }
        },
        {
          "key_as_string" : "2014-11-01",
          "key" : 1414800000000,
          "doc_count" : 2,
          "per_make_sum" : {
            "doc_count_error_upper_bound" : 0,
            "sum_other_doc_count" : 0,
            "buckets" : [
              {
                "key" : "honda",
                "doc_count" : 2,
                "sum_price" : {
                  "value" : 40000.0
                }
              }
            ]
          },
          "total_sum" : {
            "value" : 40000.0
          }
        }
      ]
    }
  }

With elastic4s, you can write as follows:

  val aggMonthSales= search("cartxns").aggregations(
    dateHistogramAggregation("sales_per_month")
      .field("sold")
      .calendarInterval(DateHistogramInterval.Month)
      .format("yyyy-MM-dd")
      .minDocCount(1).subAggregations(
        termsAgg("month_make","make.keyword").subAggregations(
        sumAggregation("month_total_per_make").field("price")
      ),
      sumAggregation("monthly_total").field("price")
     )
   )

  println(aggMonthSales.show)
  
  val monthSalesResult = client.execute(aggMonthSales).await

  if (monthSalesResult.isSuccess)
     monthSalesResult.result.aggregations.dateHistogram("sales_per_month").buckets
       .foreach { sb =>
       println(s"${sb.date},${sb.docCount},${sb.sum("monthly_total").value}")
       sb.terms("month_make").buckets
        .foreach(mb =>       
        println(s"${mb.key},${mb.docCount},${mb.sum("month_total_per_make").value}"))
     }
  else println(s"error: ${monthSalesResult.error.causedBy.getOrElse("unknown")}")


...

POST:/cartxns/_search?
StringEntity({"aggs":{"sales_per_month":{"date_histogram":{"calendar_interval":"1M","min_doc_count":1,"format":"yyyy-MM-dd","field":"sold"},"aggs":{"month_make":{"terms":{"field":"make.keyword"},"aggs":{"month_total_per_make":{"sum":{"field":"price"}}}},"monthly_total":{"sum":{"field":"price"}}}}}},Some(application/json))
2014-01-01,1,80000.0
bmw,1,80000.0
2014-02-01,1,25000.0
ford,1,25000.0
2014-05-01,1,30000.0
ford,1,30000.0
2014-07-01,1,15000.0
toyota,1,15000.0
2014-08-01,1,12000.0
toyota,1,12000.0
2014-10-01,1,10000.0
honda,1,10000.0
2014-11-01,2,40000.0
honda,2,40000.0

Tags: Scala JSON

Posted on Fri, 15 May 2020 10:15:27 -0400 by carrotcake1029