Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions api/v1alpha2/hook_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@ type EventConfiguration struct {
// +kubebuilder:validation:Required
EventType string `json:"eventType"`

// AgentRef specifies the Kagent agent to call when this event occurs
// +kubebuilder:validation:Required
AgentRef ObjectReference `json:"agentRef"`
// AgentRef specifies the Kagent agent to call when this event occurs (new format)
// +kubebuilder:validation:Optional
AgentRef ObjectReference `json:"agentRef,omitempty"`

// AgentId specifies the Kagent agent to call when this event occurs (legacy format)
// +kubebuilder:validation:Optional
AgentId string `json:"agentId,omitempty"`
Comment on lines +36 to +38
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this added back in, this format has been deprecated and has many issues.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess a rebase gone wrong. @DmarshalTU - we switched to AgentRef instead of AgentId to be aligned with the rest of Kagent


// Prompt specifies the prompt template to send to the agent
// +kubebuilder:validation:Required
Expand Down
24 changes: 19 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
kagentv1alpha2 "github.com/kagent-dev/khook/api/v1alpha2"
kclient "github.com/kagent-dev/khook/internal/client"
"github.com/kagent-dev/khook/internal/config"
"github.com/kagent-dev/khook/internal/sre"
"github.com/kagent-dev/khook/internal/workflow"
)

Expand Down Expand Up @@ -76,8 +77,17 @@ func main() {
os.Exit(1)
}

// Start SRE-IDE server
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have an issue with nomenclature - I know you built this specifically for your use case but would like to make stuff generic so other people can potentially build on them. I think just calling it "API server" will work better.

sreServer := sre.NewServer(8082, mgr.GetClient())
go func() {
ctx := context.Background()
if err := sreServer.Start(ctx); err != nil {
setupLog.Error(err, "problem running SRE-IDE server")
}
}()

// Add workflow coordinator to manage hooks and event processing
if err := mgr.Add(newWorkflowCoordinator(mgr)); err != nil {
if err := mgr.Add(newWorkflowCoordinator(mgr, sreServer)); err != nil {
setupLog.Error(err, "unable to add workflow coordinator")
os.Exit(1)
}
Expand All @@ -91,11 +101,15 @@ func main() {

// workflowCoordinator manages the complete workflow lifecycle using proper services
type workflowCoordinator struct {
mgr ctrl.Manager
mgr ctrl.Manager
sreServer *sre.Server
}

func newWorkflowCoordinator(mgr ctrl.Manager) *workflowCoordinator {
return &workflowCoordinator{mgr: mgr}
func newWorkflowCoordinator(mgr ctrl.Manager, sreServer *sre.Server) *workflowCoordinator {
return &workflowCoordinator{
mgr: mgr,
sreServer: sreServer,
}
}

func (w *workflowCoordinator) NeedLeaderElection() bool { return true }
Expand All @@ -121,7 +135,7 @@ func (w *workflowCoordinator) Start(ctx context.Context) error {

// Create workflow coordinator
eventRecorder := w.mgr.GetEventRecorderFor("khook")
coordinator := workflow.NewCoordinator(k8s, w.mgr.GetClient(), kagentCli, eventRecorder)
coordinator := workflow.NewCoordinator(k8s, w.mgr.GetClient(), kagentCli, eventRecorder, w.sreServer)

// Start the coordinator
return coordinator.Start(ctx)
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ require (
github.com/google/gnostic-models v0.7.0 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
github.com/jackc/pgx/v5 v5.6.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a h1://KbezygeMJZCSHH+H
github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a/go.mod h1:5hDyRhoBCxViHszMt12TnOpEI4VVi+U8Gm9iphldiMA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo=
github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA=
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
Expand Down
1 change: 1 addition & 0 deletions helm/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
Chart.yaml
Chart.lock
values-test.yaml
5 changes: 5 additions & 0 deletions helm/khook/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ spec:
- name: health
containerPort: 8081
protocol: TCP
{{- if .Values.sre.enabled }}
- name: sre-server
containerPort: {{ .Values.sre.port }}
protocol: TCP
{{- end }}
livenessProbe:
{{- toYaml .Values.healthCheck.livenessProbe | nindent 10 }}
readinessProbe:
Expand Down
23 changes: 23 additions & 0 deletions helm/khook/templates/sre-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{{- if .Values.sre.enabled }}
apiVersion: v1
kind: Service
metadata:
name: {{ include "khook.fullname" . }}-sre
namespace: {{ include "khook.namespace" . }}
labels:
{{- include "khook.labels" . | nindent 4 }}
app.kubernetes.io/component: sre-server
{{- with .Values.sre.service.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.sre.service.type }}
ports:
- name: sre-server
port: {{ .Values.sre.service.port }}
targetPort: sre-server
protocol: TCP
selector:
{{- include "khook.selectorLabels" . | nindent 4 }}
{{- end }}
9 changes: 9 additions & 0 deletions helm/khook/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ metrics:
labels: {}
annotations: {}

# SRE API Server configuration
sre:
enabled: true
port: 8082
service:
type: ClusterIP
port: 8082
annotations: {}

# RBAC configuration
rbac:
create: true
Expand Down
2 changes: 1 addition & 1 deletion internal/deduplication/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const (
// EventTimeoutDuration is the duration after which events are considered resolved
EventTimeoutDuration = 10 * time.Minute
// NotificationSuppressionDuration is the window to suppress re-sending after success
NotificationSuppressionDuration = 10 * time.Minute
NotificationSuppressionDuration = 30 * time.Second
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this needs to be discussed. While the original 10 minutes may be too long, 30 seconds is IMHO too short. Maybe we should at least make this configurable by values/configmap


// StatusFiring indicates an event is currently active
StatusFiring = "firing"
Expand Down
56 changes: 49 additions & 7 deletions internal/pipeline/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/kagent-dev/khook/api/v1alpha2"
"github.com/kagent-dev/khook/internal/interfaces"
"github.com/kagent-dev/khook/internal/sre"
)

// Processor handles the complete event processing pipeline
Expand All @@ -22,6 +23,7 @@ type Processor struct {
deduplicationManager interfaces.DeduplicationManager
kagentClient interfaces.KagentClient
statusManager interfaces.StatusManager
sreServer interface{}
logger logr.Logger
}

Expand All @@ -31,12 +33,14 @@ func NewProcessor(
deduplicationManager interfaces.DeduplicationManager,
kagentClient interfaces.KagentClient,
statusManager interfaces.StatusManager,
sreServer interface{},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this an interface{} and not a concrete type?

) *Processor {
return &Processor{
eventWatcher: eventWatcher,
deduplicationManager: deduplicationManager,
kagentClient: kagentClient,
statusManager: statusManager,
sreServer: sreServer,
logger: log.Log.WithName("event-processor"),
}
}
Expand Down Expand Up @@ -133,13 +137,38 @@ func (p *Processor) processEventMatch(ctx context.Context, match EventMatch) err
return fmt.Errorf("failed to record event in deduplication manager: %w", err)
}

agentRefNs := match.Hook.Namespace
if match.Configuration.AgentRef.Namespace != nil {
agentRefNs = *match.Configuration.AgentRef.Namespace
}
agentRef := types.NamespacedName{
Name: match.Configuration.AgentRef.Name,
Namespace: agentRefNs,
// Handle both agentId (legacy) and agentRef (new) formats
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a fan of this as I mentioned above

var agentRef types.NamespacedName
if match.Configuration.AgentRef.Name != "" {
// New format: agentRef
agentRefNs := match.Hook.Namespace
if match.Configuration.AgentRef.Namespace != nil {
agentRefNs = *match.Configuration.AgentRef.Namespace
}
agentRef = types.NamespacedName{
Name: match.Configuration.AgentRef.Name,
Namespace: agentRefNs,
}
} else {
// Legacy format: agentId (parse "namespace/name" format)
agentId := match.Configuration.AgentId
if agentId == "" {
return fmt.Errorf("neither agentRef.name nor agentId is specified")
}

// Parse agentId format: "namespace/name" or just "name"
parts := strings.Split(agentId, "/")
if len(parts) == 2 {
agentRef = types.NamespacedName{
Name: parts[1],
Namespace: parts[0],
}
} else {
agentRef = types.NamespacedName{
Name: parts[0],
Namespace: match.Hook.Namespace,
}
}
}

// Record that the event is firing
Expand Down Expand Up @@ -167,6 +196,19 @@ func (p *Processor) processEventMatch(ctx context.Context, match EventMatch) err
// Continue even if status recording fails
}

// Add alert to SRE server if available
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At a high level, why do we have a callout to the server from here, rather than the server sharing the same datasource as this component?

p.logger.Info("Checking SRE server integration", "sreServer", p.sreServer != nil)
if p.sreServer != nil {
if sreServer, ok := p.sreServer.(*sre.Server); ok {
// Convert event to alert and add to SRE server
alert := sre.ConvertEventToAlert(match.Event, match.Hook, agentRef, response)
sreServer.AddAlert(alert)
p.logger.Info("Added alert to SRE server", "alertId", alert.ID)
} else {
p.logger.Error(nil, "Type assertion failed for SRE server", "sreServerType", fmt.Sprintf("%T", p.sreServer))
}
}

// Mark event as notified to suppress re-sending within suppression window
p.deduplicationManager.MarkNotified(hookRef, match.Event)

Expand Down
Loading