diff --git a/.github/workflows/publish-grafana-image.yaml b/.github/workflows/publish-grafana-image.yaml new file mode 100644 index 0000000..a957d0a --- /dev/null +++ b/.github/workflows/publish-grafana-image.yaml @@ -0,0 +1,49 @@ +name: Create and publish SSO Grafana + +on: + push: + branches: + - main + - dev + paths: + - grafana-config/* + - .github/workflows/publish-grafana-image.yaml + +env: + GITHUB_REGISTRY: ghcr.io + IMAGE_NAME: bcgov/sso-loki + +jobs: + build-and-push-image: + runs-on: ubuntu-22.04 + permissions: + contents: read + packages: write + + steps: + - uses: hmarr/debug-action@v3 + - uses: actions/checkout@v4 + + - name: Log in to the GitHub Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.GITHUB_REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.GITHUB_REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=sha,format=long + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: grafana-config + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/terraform.yaml b/.github/workflows/terraform.yaml new file mode 100644 index 0000000..00c321a --- /dev/null +++ b/.github/workflows/terraform.yaml @@ -0,0 +1,94 @@ +name: Terraform + +on: + workflow_dispatch: + push: + branches: + - dev + - main + paths: + - terraform-ecs/** + - loki-authorizer/** + - .github/** + +env: + TF_VERSION: 1.9.7 + +jobs: + terraform: + permissions: write-all + runs-on: ubuntu-22.04 + steps: + - uses: hmarr/debug-action@v3 + - uses: actions/checkout@v4 + + - name: Install asdf + uses: asdf-vm/actions/setup@v3 + + - name: Cache tools + uses: actions/cache@v4 + with: + path: /home/runner/.asdf + key: ${{ runner.os }}-${{ hashFiles('**/.tool-versions') }} + + - name: 
Install required tools + run: | + cat .tool-versions | cut -f 1 -d ' ' | xargs -n 1 asdf plugin-add || true + asdf plugin-update --all + asdf install + asdf reshim + shell: bash + + - name: Set env to development + if: (github.ref == 'refs/heads/dev' && github.event_name == 'push') + run: | + cat >> $GITHUB_ENV <> $GITHUB_ENV < backend.hcl + bucket = "${{ env.S3_BACKEND_NAME }}" + key = "tf-state" + region = "ca-central-1" + EOF + terraform init -backend-config=backend.hcl + + - name: Terraform Plan + run: terraform plan -var "auth_secret=${{env.LOKI_AUTH_TOKEN}}" -var "bucket_name=${{ env.LOKI_BUCKET_NAME }}" -var "loki_tag=${{env.LOKI_TAG}}" -no-color + working-directory: ./terraform-ecs + + - name: Terraform Apply + run: terraform apply -var "auth_secret=${{env.LOKI_AUTH_TOKEN}}" -var "bucket_name=${{ env.LOKI_BUCKET_NAME }}" -var "loki_tag=${{env.LOKI_TAG}}" -auto-approve + working-directory: ./terraform-ecs diff --git a/.gitignore b/.gitignore index e554ee4..062c7d0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ helm/**/charts get-pip.py **/.env +node_modules +.terraform +*.tfstate +*.tfstate.backup +loki-authorizer.zip diff --git a/.tool-versions b/.tool-versions index 7e39095..337957b 100644 --- a/.tool-versions +++ b/.tool-versions @@ -4,3 +4,4 @@ postgres 14.1 golang 1.21.0 helm 3.10.2 loki-logcli 2.9.2 +terraform 1.9.7 diff --git a/README.md b/README.md index 2265768..45fd69e 100644 --- a/README.md +++ b/README.md @@ -31,19 +31,34 @@ SSO Keycloak dashboard services provide the ability to monitor real-time statist 1. `Promtail` & `Loki`: collect, transform and load raw log data for the designated time period. -1. `Loki` & `MinIO`: provide the Amazon S3 compatible Object Storage to store/read compacted event data by Loki. +1. `Loki` & `S3`: provide the Amazon S3 compatible Object Storage to store/read compacted event data by Loki. 1. `Promtail` & `Custom Go server`: collect, and upsert the aggreated event historial data in DB. 1. 
`Grafana`: connect Loki and the aggregation DB to visualize the logs and stats. - ![SSO Dashboard Architecture Diagram](assets/sso-dashboard-arch.gif) + ![SSO Dashboard Architecture Diagram](assets/sso-dashboard.drawio.svg) + +1. Loki in AWS breakdown: + + ![SSO Loki on AWS Diagram](assets/sso-dashboard-aws.drawio.svg) + +### Loki in AWS ECS Cluster + +Loki has a Helm chart for deploying in Kubernetes. For the deployment in an ECS cluster, there are a few changes to note: + +- Service discovery can be used in ECS to replace services in k8s. Since we cannot use this in the BCGov AWS, it has been replaced with a network load balancer. This is necessary to allow read and write tasks to communicate on port 7946. If this is not working, you will see "empty ring" errors. +- ECS does not support config maps. To replace them, a custom image was built with custom configuration files. Configurations that will be changed at runtime can set their values with the syntax `${ENV_VAR:-default}`, and environment variables can be used to configure them. Values consistent across environments can be hardcoded. +- The Helm chart includes a deployment "gateway". This is an nginx reverse proxy which provides path-based routing to the read and write services. It has been replaced with listener rules on the application load balancer. ## Deployment -It continuously deploys the resources in the sandbox and the prod environment based on the repository branch (pr's to dev deploys sandbox, pr's to main deploys prod) that has the new changes. +The Helm charts for the Promtail instances and Grafana dashboard can be installed with make commands. These automate adding environment variables from .env files in their directories. See the directory READMEs for more information. + +The Loki setup is deployed with Terraform into AWS. It deploys automatically on merge to dev/main. 
+ GitHub CD pipeline scripts are triggered based on the directory that has changed; there is a recommended deployment order when deploying the resources for the very first time: 1. `Loki`: deploys the `MinIO` and `Loki` resources, `read`, `write`, and `gateway`. @@ -64,8 +79,6 @@ The following secrets are set in the GitHub secrets of the repository and can be - `SANDBOX_SSO_CLIENT_ID`: the SSO integration credentials, `client id`, to set in `Grafana` and `MinIO` dashboard UI. - `SANDBOX_SSO_CLIENT_SECRET`: the SSO integration credentials, `client secret`, to set in `Grafana` and `MinIO` dashboard UI. - please find the integration `#4492 SSO Dashboard` via [CSS app](https://bcgov.github.io/sso-requests) -- `SANDBOX_MINIO_USER`: the username of the initial MinIO admin account. -- `SANDBOX_MINIO_PASS`: the password of the initial MinIO admin account. ### Production @@ -76,5 +89,3 @@ The following secrets are set in the GitHub secrets of the repository and can be - `PROD_SSO_CLIENT_ID`: the SSO integration credentials, `client id`, to set in `Grafana` and `MinIO` dashboard UI. - `PROD_SSO_CLIENT_SECRET`: the SSO integration credentials, `client secret`, to set in `Grafana` and `MinIO` dashboard UI. - please find the integration `#4492 SSO Dashboard` via [CSS app](https://bcgov.github.io/sso-requests) -- `PROD_MINIO_USER`: the username of the initial MinIO admin account. -- `PROD_MINIO_PASS`: the password of the initial MinIO admin account. diff --git a/assets/sso-dashboard-aws.drawio.svg b/assets/sso-dashboard-aws.drawio.svg new file mode 100644 index 0000000..6d3202d --- /dev/null +++ b/assets/sso-dashboard-aws.drawio.svg @@ -0,0 +1,4 @@ + + + +
ECS Cluster
Loki Write Task
Network Load Balancer
Loki Read Task
Autoscaler
Lambda Authorizer Function
Application Load Balancer
API Gateway
Authorized Traffic
Path based routing
Path Based Routing
Amazon S3
S3 Bucket
External Traffic
diff --git a/assets/sso-dashboard-aws.drawio.xml b/assets/sso-dashboard-aws.drawio.xml new file mode 100644 index 0000000..7e50898 --- /dev/null +++ b/assets/sso-dashboard-aws.drawio.xml @@ -0,0 +1,179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/sso-dashboard.drawio b/assets/sso-dashboard.drawio index 4be409a..9285e28 100644 --- a/assets/sso-dashboard.drawio +++ b/assets/sso-dashboard.drawio @@ -1,283 +1,370 @@ - - - + + + - - + + + + + - - + + + + + - - + + - - + + - - + + - - - - - - - + + - + + + + - - - - - - - + + - - - - - - - - - + + - - - - - - - - - + + - - + + - - - - + + - - + + - - + + + + + + + + + + + - - + + - + + + + + + + + + + + + + - + - - + + - + - + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + - + + - - + + - - + + + + + + + + - + - + - - + + - - - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - - - + + - - + + - + - + - - + + - + - + - - + + - - - - - + + - - + + - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + + + - + + - - + + + + + + + + - + - + + + + + - - - - - - - - - - - + + + + + + + + - - + + - + - - + + + - - + + - - + + - + + + - - + + - - - - - - - - + + - - + + + + + + + diff --git a/assets/sso-dashboard.drawio.svg b/assets/sso-dashboard.drawio.svg new file mode 100644 index 0000000..060af42 --- /dev/null +++ b/assets/sso-dashboard.drawio.svg @@ -0,0 +1,4 @@ + + + +
Tools
SSO DASHBOARD
Aggregator
Amazon S3
Disk logs
Keycloak DEV
Promtail Aggregator
Promtail Loki
Dev
Disk logs
Keycloak TEST
Promtail Aggregator
Promtail Loki
Test
Disk logs
Keycloak PROD
Promtail Aggregator
Promtail Loki
Prod
AWS
diff --git a/grafana-config/Dockerfile b/grafana-config/Dockerfile new file mode 100644 index 0000000..8abd32e --- /dev/null +++ b/grafana-config/Dockerfile @@ -0,0 +1,16 @@ +# Use the official Loki base image +FROM docker.io/grafana/loki:2.9.2 + +# Set the working directory to /etc/loki +WORKDIR /etc/loki + +# Copy your custom config file into the container +COPY ./config.yaml /etc/loki/config/ +COPY ./runtime-config.yaml /etc/loki/runtime-config/runtime-config.yaml + +# Setup loki user permissions for working directory +USER root +RUN mkdir -p /var/loki/retention && chown -R loki:loki /var/loki +USER loki + +ENTRYPOINT ["/usr/bin/loki"] diff --git a/grafana-config/config.yaml b/grafana-config/config.yaml new file mode 100644 index 0000000..6bba179 --- /dev/null +++ b/grafana-config/config.yaml @@ -0,0 +1,92 @@ +auth_enabled: false +common: + path_prefix: /var/loki + replication_factor: 3 + storage: + s3: + bucketnames: ${S3_BUCKETNAME:-sso-loki} + region: ${S3_REGION:-ca-central-1} + endpoint: ${S3_ENDPOINT:-s3.ca-central-1.amazonaws.com} + s3forcepathstyle: true + insecure: false + ring: + kvstore: + store: memberlist +compactor: + compaction_interval: 10m + retention_delete_delay: 1h + retention_delete_worker_count: 150 + retention_enabled: true + shared_store: s3 + working_directory: /var/loki/retention +index_gateway: + mode: ring +ingester: + chunk_encoding: snappy + chunk_idle_period: 2h + chunk_target_size: 5242880 + max_chunk_age: 2h + wal: + enabled: true + flush_on_shutdown: true +limits_config: + enforce_metric_name: false + ingestion_rate_mb: 10 + max_cache_freshness_per_query: 1m + max_entries_limit_per_query: 100000 + max_global_streams_per_user: 1000000 + max_query_length: 721h + max_query_parallelism: 32 + max_query_series: 3000 + query_timeout: 500s + reject_old_samples: true + reject_old_samples_max_age: 168h + retention_period: 168h + split_queries_by_interval: 30m +memberlist: + join_members: + - ${JOIN_MEMBERS:-someurl} + bind_port: 7946 + 
bind_addr: [0.0.0.0] + gossip_interval: 5s + advertise_port: 7946 + +query_range: + align_queries_with_step: true +ruler: + storage: + s3: + bucketnames: ${S3_BUCKETNAME} + type: s3 +runtime_config: + file: /etc/loki/runtime-config/runtime-config.yaml +schema_config: + configs: + - from: '2022-12-21' + index: + period: 24h + prefix: index_ + object_store: s3 + schema: v11 + store: boltdb-shipper +server: + http_listen_address: 0.0.0.0 + grpc_listen_address: 0.0.0.0 + http_listen_port: 3100 + grpc_listen_port: 9095 + grpc_server_max_recv_msg_size: 26214400 + grpc_server_max_send_msg_size: 26214400 + http_server_idle_timeout: 500s + http_server_read_timeout: 500s + http_server_write_timeout: 500s + log_level: debug +storage_config: + boltdb_shipper: + active_index_directory: /var/loki/active + cache_location: /var/loki/cache + cache_ttl: 24h + shared_store: s3 + hedging: + at: 250ms + max_per_second: 20 + up_to: 3 diff --git a/grafana-config/runtime-config.yaml b/grafana-config/runtime-config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/helm/grafana/.env.example b/helm/grafana/.env.example index 804945b..fdb3721 100644 --- a/helm/grafana/.env.example +++ b/helm/grafana/.env.example @@ -25,6 +25,8 @@ DS_AGGREGATOR_USERNAME= DS_AGGREGATOR_PASSWORD= DS_AGGREGATOR_DATABASE= +LOKI_AUTH_TOKEN= +API_GATEWAY_URL= # # production creds # NAME=sso-grafana # NAMESPACE=eb75ad-tools diff --git a/helm/grafana/Makefile b/helm/grafana/Makefile index 5e68ab7..77dcbb3 100644 --- a/helm/grafana/Makefile +++ b/helm/grafana/Makefile @@ -22,6 +22,8 @@ define arguments --set grafana.datasources."datasources\.yaml".datasources[3].user=${DS_KEYCLOAK_PROD_USERNAME} \ --set grafana.datasources."datasources\.yaml".datasources[3].database=${DS_KEYCLOAK_PROD_DATABASE} \ --set grafana.datasources."datasources\.yaml".datasources[3].secureJsonData.password=${DS_KEYCLOAK_PROD_PASSWORD} \ + --set 
grafana.datasources."datasources\.yaml".datasources[4].secureJsonData.httpHeaderValue1="Bearer ${LOKI_AUTH_TOKEN}" \ + --set grafana.datasources."datasources\.yaml".datasources[4].url=${API_GATEWAY_URL} \ --set grafana.datasources."datasources\.yaml".datasources[5].user=${DS_AGGREGATOR_USERNAME} \ --set grafana.datasources."datasources\.yaml".datasources[5].database=${DS_AGGREGATOR_DATABASE} \ --set grafana.datasources."datasources\.yaml".datasources[5].secureJsonData.password=${DS_AGGREGATOR_PASSWORD} diff --git a/helm/grafana/README.md b/helm/grafana/README.md index 0727d2a..c5c2249 100644 --- a/helm/grafana/README.md +++ b/helm/grafana/README.md @@ -31,6 +31,7 @@ make upgrade ``` - please find the SSO client credentials of the integration `#4492 SSO Dashboard` via [CSS app](https://bcgov.github.io/sso-requests): +- The variables for LOKI_AUTH_TOKEN and API_GATEWAY_URL can be found in the tools namespace under the loki-auth-token secret ### Uninstalling the Chart diff --git a/helm/grafana/values-e4ca1d-tools.yaml b/helm/grafana/values-e4ca1d-tools.yaml index 55f82e8..f441d11 100644 --- a/helm/grafana/values-e4ca1d-tools.yaml +++ b/helm/grafana/values-e4ca1d-tools.yaml @@ -88,9 +88,13 @@ grafana: type: loki access: proxy orgId: 1 - url: http://sso-loki-gateway.e4ca1d-prod.svc.cluster.local + url: basicAuth: false isDefault: true + jsonData: + httpHeaderName1: "Authorization" + secureJsonData: + httpHeaderValue1: - name: SSO Aggregator type: postgres access: proxy diff --git a/helm/promtail-loki/.env.example b/helm/promtail-loki/.env.example new file mode 100644 index 0000000..85dfb8c --- /dev/null +++ b/helm/promtail-loki/.env.example @@ -0,0 +1,2 @@ +LOKI_AUTH_TOKEN= +API_GATEWAY_URL= diff --git a/helm/promtail-loki/Makefile b/helm/promtail-loki/Makefile index 0ef32b8..e992dc7 100644 --- a/helm/promtail-loki/Makefile +++ b/helm/promtail-loki/Makefile @@ -1,3 +1,5 @@ +include /$(PWD)/.env + SHELL := /usr/bin/env bash NAMESPACE="" NAME=sso-promtail-loki @@ -7,7 
+9,9 @@ $(error NAMESPACE is not set) endif define arguments - "${NAME}" . -n "${NAMESPACE}" -f values.yaml -f "values-${NAMESPACE}.yaml" + "${NAME}" . -n "${NAMESPACE}" -f values.yaml -f "values-${NAMESPACE}.yaml" \ + --set promtail.config.clients[0].headers.Authorization="Bearer ${LOKI_AUTH_TOKEN}" \ + --set promtail.config.clients[0].url="${API_GATEWAY_URL}/loki/api/v1/push" endef .PHONY: helm-dep diff --git a/helm/promtail-loki/README.md b/helm/promtail-loki/README.md index 84f834e..a8c57c0 100644 --- a/helm/promtail-loki/README.md +++ b/helm/promtail-loki/README.md @@ -12,6 +12,8 @@ For grafana loki we have found that keeping a minimal label set is ideal for per ## Local deployment via Helm chart +Create a .env file following the variables in .env.example. See the tools-namespace secrets for the loki auth token and api gateway url under the loki-auth-token secret. + ### Installing/Upgrading the Chart ```sh diff --git a/helm/promtail-loki/values-e4ca1d-dev.yaml b/helm/promtail-loki/values-e4ca1d-dev.yaml index 7b1ef15..fe7edb7 100644 --- a/helm/promtail-loki/values-e4ca1d-dev.yaml +++ b/helm/promtail-loki/values-e4ca1d-dev.yaml @@ -8,8 +8,10 @@ promtail: config: logLevel: info clients: - - url: http://sso-loki-gateway.e4ca1d-prod.svc.cluster.local/loki/api/v1/push + - url: tenant_id: sso-team + headers: + Authorization: snippets: scrapeConfigs: | {{- tpl .Values.ssoScrapeConfigs . }} diff --git a/helm/promtail-loki/values-e4ca1d-prod.yaml b/helm/promtail-loki/values-e4ca1d-prod.yaml index f5b92af..f5f14b2 100644 --- a/helm/promtail-loki/values-e4ca1d-prod.yaml +++ b/helm/promtail-loki/values-e4ca1d-prod.yaml @@ -8,8 +8,10 @@ promtail: config: logLevel: info clients: - - url: http://sso-loki-gateway.e4ca1d-prod.svc.cluster.local/loki/api/v1/push + - url: tenant_id: sso-team + headers: + Authorization: snippets: scrapeConfigs: | {{- tpl .Values.ssoScrapeConfigs . 
}} diff --git a/helm/promtail-loki/values-e4ca1d-test.yaml b/helm/promtail-loki/values-e4ca1d-test.yaml index 8b4eb55..43fdf7b 100644 --- a/helm/promtail-loki/values-e4ca1d-test.yaml +++ b/helm/promtail-loki/values-e4ca1d-test.yaml @@ -8,8 +8,10 @@ promtail: config: logLevel: info clients: - - url: http://sso-loki-gateway.e4ca1d-prod.svc.cluster.local/loki/api/v1/push + - url: tenant_id: sso-team + headers: + Authorization: snippets: scrapeConfigs: | {{- tpl .Values.ssoScrapeConfigs . }} diff --git a/loki-authorizer/Makefile b/loki-authorizer/Makefile new file mode 100644 index 0000000..4fe65a3 --- /dev/null +++ b/loki-authorizer/Makefile @@ -0,0 +1,4 @@ +.PHONY: build +build: + zip -rq loki-authorizer.zip authorize.js + mv loki-authorizer.zip ../terraform-ecs diff --git a/loki-authorizer/authorize.js b/loki-authorizer/authorize.js new file mode 100644 index 0000000..8022315 --- /dev/null +++ b/loki-authorizer/authorize.js @@ -0,0 +1,8 @@ +// See here for reference: https://docs.aws.amazon.com/apigateway/latest/developerguide/http-api-lambda-authorizer.html +module.exports.handler = async (event) => { + const token = event.headers.authorization?.split("Bearer ")?.[1]; + const secret = process.env.AUTH_SECRET; + // Fail closed: an unset or empty AUTH_SECRET must never match a missing or empty token. + const isAuthorized = Boolean(secret) && token === secret; + return { isAuthorized }; +}; diff --git a/terraform-ecs/.terraform.lock.hcl b/terraform-ecs/.terraform.lock.hcl new file mode 100644 index 0000000..dc75bed --- /dev/null +++ b/terraform-ecs/.terraform.lock.hcl @@ -0,0 +1,24 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/aws" { + version = "5.70.0" + hashes = [ + "h1:LKnWZnujHcQPm3MAk4elP3H9VXNjlO6rNqlO5s330Yg=", + "zh:09cbec93c324e6f03a866244ecb2bae71fdf1f5d3d981e858b745c90606b6b6d", + "zh:19685d9f4c9ddcfa476a9a428c6c612be4a1b4e8e1198fbcbb76436b735284ee", + "zh:3358ee6a2b24c982b7c83fac0af6898644d1bbdabf9c4e0589e91e427641ba88", + "zh:34f9f2936de7384f8ed887abdbcb54aea1ce7b0cf2e85243a3fd3904d024747f", + "zh:4a99546cc2140304c90d9ccb9db01589d4145863605a0fcd90027a643ea3ec5d", + "zh:4da32fec0e10dab5aa3dea3c9fe57adc973cc73a71f5d59da3f65d85d925dc3f", + "zh:659cf94522bc38ce0af70f7b0371b2941a0e0bcad02d17c1a7b264575fe07224", + "zh:6f1c172c9b98bc86e4f0526872098ee3246c2620f7b323ce0c2ce6427987f7d2", + "zh:79bf8fb8f37c308742e287694a9de081ff8502b065a390d1bcfbd241b4eca203", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:b7a5e1dfd9e179d70a169ddd4db44b56da90309060e27d36b329fe5fb3528e29", + "zh:c2cc728cb18ffd5c4814a10c203452c71f5ab0c46d68f9aa9183183fa60afd87", + "zh:c89bb37d2b8947c9a0d62b0b86ace51542f3327970f4e56a68bf81d9d0b8b65b", + "zh:ef2a61e8112c3b5e70095508aadaadf077e904b62b9cfc22030337f773bba041", + "zh:f714550b858d141ea88579f25247bda2a5ba461337975e77daceaf0bb7a9c358", + ] +} diff --git a/terraform-ecs/api_gateway.tf b/terraform-ecs/api_gateway.tf new file mode 100644 index 0000000..d43b75d --- /dev/null +++ b/terraform-ecs/api_gateway.tf @@ -0,0 +1,44 @@ +resource "aws_apigatewayv2_api" "sso_loki_api" { + name = "loki-api" + protocol_type = "HTTP" +} + +resource "aws_apigatewayv2_vpc_link" "loki_vpc_link" { + name = "loki_vpc_link" + subnet_ids = [data.aws_subnet.subnet_a.id, data.aws_subnet.subnet_b.id] + security_group_ids = [aws_security_group.loki_sg.id] +} + +resource "aws_apigatewayv2_integration" "sso_loki_api_integration" { + api_id = aws_apigatewayv2_api.sso_loki_api.id + integration_type = "HTTP_PROXY" + connection_id = aws_apigatewayv2_vpc_link.loki_vpc_link.id + connection_type = "VPC_LINK" + 
integration_method = "ANY" + integration_uri = aws_lb_listener.loki_listener.arn +} + +resource "aws_apigatewayv2_route" "sso_grafana_route_any" { + api_id = aws_apigatewayv2_api.sso_loki_api.id + route_key = "ANY /{proxy+}" + target = "integrations/${aws_apigatewayv2_integration.sso_loki_api_integration.id}" + + authorization_type = "CUSTOM" + authorizer_id = aws_apigatewayv2_authorizer.loki_authorizer.id +} + +resource "aws_apigatewayv2_authorizer" "loki_authorizer" { + api_id = aws_apigatewayv2_api.sso_loki_api.id + authorizer_type = "REQUEST" + enable_simple_responses = true + authorizer_uri = aws_lambda_function.auth_function.invoke_arn + identity_sources = ["$request.header.Authorization"] + name = "loki-authorizer" + authorizer_payload_format_version = "2.0" +} + +resource "aws_apigatewayv2_stage" "sso_grafana_api_default_stage" { + api_id = aws_apigatewayv2_api.sso_loki_api.id + name = "$default" + auto_deploy = true +} diff --git a/terraform-ecs/ecs.tf b/terraform-ecs/ecs.tf new file mode 100644 index 0000000..d3e3506 --- /dev/null +++ b/terraform-ecs/ecs.tf @@ -0,0 +1,255 @@ +resource "aws_ecs_cluster" "sso_ecs_cluster" { + name = "loki-cluster" +} + +resource "aws_ecs_task_definition" "loki_write" { + family = "loki-task" + execution_role_arn = aws_iam_role.loki_execution_role.arn + task_role_arn = aws_iam_role.loki_task_role.arn + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = "256" + memory = "512" + + container_definitions = jsonencode([{ + name = "loki-write" + image = "ghcr.io/bcgov/sso-loki:${var.loki_tag}" + essential = true + memory = var.loki_write_memory + cpu = var.loki_write_cpu + # IMPORTANT: Make sure ingesters have time to cut any chunks in memory. 
+ stopTimeout = 120 + + portMappings = [ + { + name = "app" + containerPort = 3100 + hostPort = 3100 + }, + { + name = "gossip" + containerPort = 7946 + hostPort = 7946 + protocol = "tcp" + }, + { + name = "grpc" + hostPort = 9095 + protocol = "tcp" + containerPort = 9095 + } + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-create-group = "true" + awslogs-group = "/ecs/${aws_ecs_cluster.sso_ecs_cluster.name}" + awslogs-region = var.region + awslogs-stream-prefix = "ecs-loki-write" + } + } + environment = [ + { + name = "S3_BUCKETNAME" + value = aws_s3_bucket.sso_loki.bucket + }, + { + name = "S3_REGION" + value = var.region + }, + { + name = "S3_ENDPOINT" + value = "s3.${var.region}.amazonaws.com" + }, + { + name = "JOIN_MEMBERS" + value = "${aws_lb.loki_gossip_lb.dns_name}:7946" + }, + ] + command = [ + "-target=write", + "-config.file=/etc/loki/config/config.yaml", + "-config.expand-env=true", + ] + }]) +} + +resource "aws_ecs_service" "loki_write" { + name = "loki-write-service" + cluster = aws_ecs_cluster.sso_ecs_cluster.id + task_definition = aws_ecs_task_definition.loki_write.arn + desired_count = 2 + launch_type = "FARGATE" + enable_execute_command = true + + load_balancer { + target_group_arn = aws_lb_target_group.loki_target_group_write.id + container_name = "loki-write" + container_port = 3100 + } + + load_balancer { + target_group_arn = aws_lb_target_group.loki_target_group_gossip.id + container_name = "loki-write" + container_port = 7946 + } + + network_configuration { + subnets = [data.aws_subnet.subnet_a.id, data.aws_subnet.subnet_b.id] + security_groups = [aws_security_group.loki_sg.id] + assign_public_ip = false + } +} + +resource "aws_ecs_task_definition" "loki_read" { + family = "loki-task" + execution_role_arn = aws_iam_role.loki_execution_role.arn + task_role_arn = aws_iam_role.loki_task_role.arn + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = "256" + memory = "512" + + 
container_definitions = jsonencode([{ + name = "loki-read" + image = "ghcr.io/bcgov/sso-loki:${var.loki_tag}" + essential = true + memory = var.loki_read_memory + cpu = var.loki_read_cpu + + portMappings = [ + { + containerPort = 3100 + hostPort = 3100 + }, + { + containerPort = 8500 + hostPort = 8500 + }, + { + name = "grpc" + hostPort = 9095 + protocol = "tcp" + containerPort = 9095 + }, + { + containerPort = 7946 + hostPort = 7946 + protocol : "tcp" + }, + ] + + environment = [ + { + name = "S3_BUCKETNAME" + value = aws_s3_bucket.sso_loki.bucket + }, + { + name = "S3_REGION" + value = var.region + }, + { + name = "S3_ENDPOINT" + value = "s3.${var.region}.amazonaws.com" + }, + { + name = "JOIN_MEMBERS" + value = "${aws_lb.loki_gossip_lb.dns_name}:7946" + }, + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + awslogs-create-group = "true" + awslogs-group = "/ecs/${aws_ecs_cluster.sso_ecs_cluster.name}" + awslogs-region = var.region + awslogs-stream-prefix = "ecs-loki-read" + } + } + command = [ + "-target=read", + "-config.file=/etc/loki/config/config.yaml", + "-config.expand-env=true", + ] + }]) +} + +resource "aws_ecs_service" "loki_read" { + name = "loki-read-service" + cluster = aws_ecs_cluster.sso_ecs_cluster.id + task_definition = aws_ecs_task_definition.loki_read.arn + desired_count = 2 + launch_type = "FARGATE" + enable_execute_command = true + + load_balancer { + target_group_arn = aws_lb_target_group.loki_target_group_read.id + container_name = "loki-read" + container_port = 3100 + } + + load_balancer { + target_group_arn = aws_lb_target_group.loki_target_group_gossip.id + container_name = "loki-read" + container_port = 7946 + } + + network_configuration { + subnets = [data.aws_subnet.subnet_a.id, data.aws_subnet.subnet_b.id] + security_groups = [aws_security_group.loki_sg.id] + assign_public_ip = false + } +} + +resource "aws_appautoscaling_target" "ecs_read_service_target" { + max_capacity = 6 + min_capacity = 2 + resource_id = 
"service/${aws_ecs_cluster.sso_ecs_cluster.name}/${aws_ecs_service.loki_read.name}" + scalable_dimension = "ecs:service:DesiredCount" + service_namespace = "ecs" +} + +resource "aws_appautoscaling_policy" "scale_out" { + name = "scale_out" + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.ecs_read_service_target.id + scalable_dimension = "ecs:service:DesiredCount" + service_namespace = "ecs" + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + step_adjustment { + scaling_adjustment = 1 # Increase the number of tasks by + # The bounds are for the difference between the trigger and the actual value + # e.g if alarm is at 60%, and you want a step for 60% - 80%, the lower bound would be zero and the upper 20. + metric_interval_lower_bound = 0 + metric_interval_upper_bound = 20 + } + step_adjustment { + scaling_adjustment = 2 + metric_interval_lower_bound = 20 + } + cooldown = 30 + } +} + +resource "aws_cloudwatch_metric_alarm" "cpu_high" { + alarm_name = "HighCpuAlarm" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 1 + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + period = 30 + statistic = "Average" + threshold = 60 + dimensions = { + ClusterName = aws_ecs_cluster.sso_ecs_cluster.name + ServiceName = aws_ecs_service.loki_read.name + } + + alarm_actions = [ + aws_appautoscaling_policy.scale_out.arn, + ] +} diff --git a/terraform-ecs/lambda-authorizer.tf b/terraform-ecs/lambda-authorizer.tf new file mode 100644 index 0000000..d7c89e4 --- /dev/null +++ b/terraform-ecs/lambda-authorizer.tf @@ -0,0 +1,33 @@ +resource "aws_lambda_function" "auth_function" { + function_name = "lokiApiAuth" + runtime = "nodejs20.x" + role = aws_iam_role.lambda_exec.arn + handler = "authorize.handler" + filename = "loki-authorizer.zip" + + source_code_hash = filebase64sha256("./loki-authorizer.zip") + + environment { + variables = { + AUTH_SECRET = var.auth_secret + } + } +} + +resource "aws_iam_role" 
"lambda_exec" {
+  name = "lambda-exec-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRole"
+        Principal = {
+          Service = "lambda.amazonaws.com"
+        }
+        Effect = "Allow"
+        Sid    = ""
+      }
+    ]
+  })
+}
diff --git a/terraform-ecs/network.tf b/terraform-ecs/network.tf
new file mode 100644
index 0000000..fb81c01
--- /dev/null
+++ b/terraform-ecs/network.tf
@@ -0,0 +1,195 @@
+# Select pre-existing networking config into data for use in resources.
+# NOTE(review): with only a state filter this lookup must match exactly one
+# VPC in the account/region; Terraform errors if several are available.
+data "aws_vpc" "selected" {
+  state = "available"
+}
+
+# Existing subnets, looked up by their Name tag (var.subnet_a / var.subnet_b).
+data "aws_subnet" "subnet_a" {
+  filter {
+    name   = "tag:Name"
+    values = [var.subnet_a]
+  }
+}
+
+data "aws_subnet" "subnet_b" {
+  filter {
+    name   = "tag:Name"
+    values = [var.subnet_b]
+  }
+}
+
+# Open for api gateway to receive traffic from internet, e.g. our openshift promtail. Authorization token is checked on all calls
+# NOTE(review): ingress allows every protocol and port from any address. Both
+# load balancers using this group are internal; consider restricting it to the
+# ports and source ranges actually required.
+resource "aws_security_group" "loki_sg" {
+  name        = "loki_sg"
+  description = "Security group for loki"
+  vpc_id      = data.aws_vpc.selected.id
+
+  ingress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1"
+    cidr_blocks = ["0.0.0.0/0"]
+  }
+}
+
+# Internal application load balancer for Loki HTTP traffic (see loki_listener).
+resource "aws_lb" "loki_lb" {
+  name                             = "loki-lb"
+  internal                         = true
+  load_balancer_type               = "application"
+  security_groups                  = [aws_security_group.loki_sg.id]
+  subnets                          = [data.aws_subnet.subnet_a.id, data.aws_subnet.subnet_b.id]
+  enable_cross_zone_load_balancing = true
+}
+
+# Internal network load balancer carrying TCP traffic on the gossip port (7946).
+resource "aws_lb" "loki_gossip_lb" {
+  name                             = "loki-gossip-lb"
+  internal                         = true
+  load_balancer_type               = "network"
+  security_groups                  = [aws_security_group.loki_sg.id]
+  subnets                          = [data.aws_subnet.subnet_a.id, data.aws_subnet.subnet_b.id]
+  enable_cross_zone_load_balancing = true
+}
+
+# Forwards TCP 7946 on the gossip load balancer to the gossip target group.
+resource "aws_lb_listener" "loki_gossip_listener" {
+  load_balancer_arn = aws_lb.loki_gossip_lb.arn
+  port              = 7946
+  protocol          = "TCP"
+
+  default_action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.loki_target_group_gossip.arn
+  }
+}
+
+# HTTP listener on port 80; requests that match no listener rule are forwarded
+# to the read target group by the default action.
+resource "aws_lb_listener" "loki_listener" {
+  load_balancer_arn = aws_lb.loki_lb.arn
+  port              = 80
+  protocol          = "HTTP"
+
+  default_action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.loki_target_group_read.arn
+  }
+}
+
+# Each ecs container set needs its rules
+# NOTE(review): write paths are presumably split across two rules because an
+# ALB rule accepts at most five condition values — confirm.
+resource "aws_lb_listener_rule" "write_service_rule" {
+  listener_arn = aws_lb_listener.loki_listener.arn
+  priority     = 98
+
+  action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.loki_target_group_write.arn
+  }
+
+  condition {
+    path_pattern {
+      values = [
+        "/api/prom/push",
+        "/loki/api/v1/push",
+        "/distributor/ring",
+        "/flush",
+        "/ingester*",
+      ]
+    }
+  }
+}
+
+resource "aws_lb_listener_rule" "write_service_rule_2" {
+  listener_arn = aws_lb_listener.loki_listener.arn
+  priority     = 97
+
+  action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.loki_target_group_write.arn
+  }
+
+  condition {
+    path_pattern {
+      values = [
+        "/ring",
+        "/memberlist",
+      ]
+    }
+  }
+}
+
+# Unmatched paths all go to the read service
+resource "aws_lb_listener_rule" "read_service_rule" {
+  listener_arn = aws_lb_listener.loki_listener.arn
+  priority     = 100
+
+  action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.loki_target_group_read.arn
+  }
+
+  condition {
+    path_pattern {
+      values = [
+        "/*",
+      ]
+    }
+  }
+}
+
+# Read and write target groups: IP targets on port 3100, health-checked on /ready.
+resource "aws_lb_target_group" "loki_target_group_read" {
+  name        = "loki-target-group-read"
+  port        = 3100
+  protocol    = "HTTP"
+  vpc_id      = data.aws_vpc.selected.id
+  target_type = "ip"
+
+  health_check {
+    path                = "/ready"
+    interval            = 30
+    timeout             = 5
+    healthy_threshold   = 3
+    unhealthy_threshold = 3
+  }
+}
+
+resource "aws_lb_target_group" "loki_target_group_write" {
+  name        = "loki-target-group-write"
+  port        = 3100
+  protocol    = "HTTP"
+  vpc_id      = data.aws_vpc.selected.id
+  target_type = "ip"
+
+  health_check {
+    path                = "/ready"
+    interval            = 30
+    timeout             = 5
+    healthy_threshold   = 3
+    unhealthy_threshold = 3
+  }
+}
+
+# Gossip target group: IP targets over plain TCP on 7946, no health_check block.
+resource "aws_lb_target_group" "loki_target_group_gossip" {
+  name        = "loki-target-group-gossip"
+  port        = 7946
+  protocol    = "TCP"
+  vpc_id      = data.aws_vpc.selected.id
+  target_type = "ip"
+}
diff --git a/terraform-ecs/providers.tf b/terraform-ecs/providers.tf
new file mode 100644
index 0000000..7be38a2
--- /dev/null
+++ b/terraform-ecs/providers.tf
@@ -0,0 +1,8 @@
+provider "aws" {
+  region = var.region
+}
+
+terraform {
+  # Partial configuration: bucket/key/region are passed at init via -backend-config.
+  backend "s3" {}
+}
diff --git a/terraform-ecs/roles.tf b/terraform-ecs/roles.tf
new file mode 100644
index 0000000..88edc02
--- /dev/null
+++ b/terraform-ecs/roles.tf
@@ -0,0 +1,124 @@
+# Execution role, permissions to log to cloudwatch
+resource "aws_iam_role" "loki_execution_role" {
+  name = "loki-execution-role"
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = "ecs-tasks.amazonaws.com"
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+}
+
+# CloudWatch Logs permissions attached to the execution role below.
+resource "aws_iam_policy" "loki_execution_policy" {
+  name        = "loki-execution-policy"
+  description = "Permissions for ECS task execution, including logging to CloudWatch"
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "logs:CreateLogGroup",
+          "logs:CreateLogStream",
+          "logs:PutLogEvents"
+        ]
+        Resource = "*"
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "loki_execution_role_attachment" {
+  role       = aws_iam_role.loki_execution_role.name
+  policy_arn = aws_iam_policy.loki_execution_policy.arn
+}
+
+# Task role for running container, needs to connect to the s3 bucket for logs
+resource "aws_iam_role" "loki_task_role" {
+  name = "loki-task-role"
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = "ecs-tasks.amazonaws.com"
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+}
+
+# Least-privilege S3 access for Loki: list the bucket and read/write/delete its
+# objects. Do not add "s3:*" here; it would make the explicit list meaningless.
+resource "aws_iam_policy" "loki_task_s3_policy" {
+  name        = "loki-task-policy-s3"
+  description = "Permissions for Loki to access S3"
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:GetObject",
+          "s3:ListBucket",
+          "s3:PutObject",
+          "s3:DeleteObject",
+        ]
+        Resource = [
+          "arn:aws:s3:::${aws_s3_bucket.sso_loki.bucket}",
+          "arn:aws:s3:::${aws_s3_bucket.sso_loki.bucket}/*",
+        ]
+      },
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "loki_task_s3_role_attachment" {
+  role       = aws_iam_role.loki_task_role.name
+  policy_arn = aws_iam_policy.loki_task_s3_policy.arn
+}
+
+# Allow api gateway to use lambdas
+resource "aws_lambda_permission" "allow_api_gateway" {
+  statement_id  = "AllowExecutionFromAPIGateway"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.auth_function.function_name
+  principal     = "apigateway.amazonaws.com"
+  source_arn    = "${aws_apigatewayv2_api.sso_loki_api.execution_arn}/*/*"
+}
+
+# Below permission can be added temporarily if needing to ssh into the loki tasks for debugging.
+
+# resource "aws_iam_policy" "loki_task_ssh_policy" {
+#   name        = "loki-task-policy-ssh"
+#   description = "Permissions to ssh to loki tasks"
+#   policy = jsonencode({
+#     Version = "2012-10-17"
+#     Statement = [
+#       {
+#         Effect = "Allow"
+#         Action = [
+#           "ssmmessages:CreateControlChannel",
+#           "ssmmessages:CreateDataChannel",
+#           "ssmmessages:OpenControlChannel",
+#           "ssmmessages:OpenDataChannel"
+#         ]
+#         Resource = "*"
+#       },
+#     ]
+#   })
+# }
+
+# resource "aws_iam_role_policy_attachment" "loki_task_ssh_role_attachment" {
+#   role       = aws_iam_role.loki_task_role.name
+#   policy_arn = aws_iam_policy.loki_task_ssh_policy.arn
+# }
diff --git a/terraform-ecs/s3.tf b/terraform-ecs/s3.tf
new file mode 100644
index 0000000..bff2646
--- /dev/null
+++ b/terraform-ecs/s3.tf
@@ -0,0 +1,4 @@
+# Bucket the Loki task role is granted access to; its name comes from var.bucket_name.
+resource "aws_s3_bucket" "sso_loki" {
+  bucket = var.bucket_name
+}
diff --git a/terraform-ecs/variables.tf b/terraform-ecs/variables.tf
new file mode 100644
index 0000000..ba13501
--- /dev/null
+++ b/terraform-ecs/variables.tf
@@ -0,0 +1,60 @@
+variable "subnet_a" {
+  type        = string
+  description = "Value of the name tag for the app subnet in AZ a"
+  default     = "Web_Dev_aza_net"
+}
+
+variable "subnet_b" {
+  type        = string
+  description = "Value of the name tag for the app subnet in AZ b"
+  default     = "Web_Dev_azb_net"
+}
+
+variable "region" {
+  type        = string
+  description = "AWS region used by the provider"
+  default     = "ca-central-1"
+}
+
+variable "auth_secret" {
+  type        = string
+  description = "Authentication secret to use loki API"
+  sensitive   = true
+}
+
+# NOTE(review): the CPU/memory values below are presumably ECS task sizes (CPU
+# units and MiB); confirm against the task definitions that consume them.
+variable "loki_read_cpu" {
+  type        = number
+  description = "CPU units, e.g. 1024 = 1 vCPU"
+  default     = 256
+}
+
+variable "loki_write_cpu" {
+  type        = number
+  description = "CPU units, e.g. 1024 = 1 vCPU"
+  default     = 256
+}
+
+variable "loki_read_memory" {
+  type        = number
+  description = "Memory in MiB"
+  default     = 512
+}
+
+variable "loki_write_memory" {
+  type        = number
+  description = "Memory in MiB"
+  default     = 512
+}
+
+variable "bucket_name" {
+  type        = string
+  description = "Name of the S3 bucket created for Loki"
+}
+
+# NOTE(review): presumably the Loki image tag to deploy — confirm where it is used.
+variable "loki_tag" {
+  type    = string
+  default = "dev"
+}