# Software Lifecycle
fleetd provides complete software lifecycle management for edge applications, from build artifacts to production deployment with monitoring and rollback capabilities.
## Core Concepts

### Artifacts

Software packages that can be deployed to devices (one possible type definition is sketched after this list):
- Binaries: Standalone executables
- OCI Images: Docker/Podman containers
- Nixpacks: Reproducible Nix packages
- Archives: tar.gz, zip with scripts
- System Packages: deb, rpm, apk
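As a hypothetical Go sketch (the enum values are illustrative, not fleetd's actual identifiers), the artifact types above could be modeled as:

```go
// ArtifactType enumerates the supported package formats.
type ArtifactType int

const (
	ArtifactBinary        ArtifactType = iota // standalone executable
	ArtifactOCIImage                          // Docker/Podman container image
	ArtifactNixpack                           // reproducible Nix package
	ArtifactArchive                           // tar.gz/zip with scripts
	ArtifactSystemPackage                     // deb, rpm, apk
)
```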
### Applications

Logical groupings of artifacts with (a struct sketch follows this list):
- Configuration
- Runtime requirements
- Health checks
- Resource limits
- Restart policies
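A minimal hypothetical Go sketch of that grouping (field and type names are assumptions, not fleetd's actual API):

```go
// Application bundles artifacts with the policy needed to run them.
type Application struct {
	Name      string
	Version   string
	Artifacts []Artifact

	Config        map[string]string // environment and file configuration
	HealthCheck   HealthCheckSpec   // probe path, interval, retries
	ResourceLimit ResourceSpec      // memory/CPU limits and requests
	RestartPolicy string            // "always", "on-failure", "never"
}
```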
### Deployments

Controlled rollout of applications to device fleets, covering (see the sketch after this list):
- Target selection (device groups, tags, capabilities)
- Rollout strategy (canary, rolling, blue-green)
- Success criteria
- Rollback triggers
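Tying those pieces together, a deployment might look like this hypothetical sketch (the field types are assumptions chosen to match the strategy code later on this page):

```go
// Deployment describes one controlled rollout of an application.
type Deployment struct {
	Application string
	Version     string

	// Devices resolved from group, tag, or capability selectors.
	TargetDevices []Device

	// Rollout strategy: canary, rolling, or blue-green.
	Strategy Strategy

	// What counts as success, and what forces a rollback.
	SuccessCriteria  SuccessCriteria
	RollbackTriggers []RollbackTrigger
}
```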
## Architecture

```mermaid
graph TB
    subgraph "CI/CD Integration"
        GHA[GitHub Actions]
        GL[GitLab CI]
        JK[Jenkins]
        API[fleetd API]
    end

    subgraph "fleetd Control Plane"
        AM[Artifact Manager]
        AS[Artifact Storage]
        DM[Deployment Manager]
        SM[Strategy Engine]
        MM[Metrics Monitor]
    end

    subgraph "Edge Devices"
        AG1[fleetd Agent 1]
        AG2[fleetd Agent 2]
        AG3[fleetd Agent N]
        P1[App Process 1]
        P2[App Process 2]
        P3[App Process N]
    end

    GHA --> API
    GL --> API
    JK --> API
    API --> AM
    AM --> AS
    AM --> DM
    DM --> SM
    SM --> AG1
    SM --> AG2
    SM --> AG3
    AG1 --> P1
    AG2 --> P2
    AG3 --> P3
    P1 --> MM
    P2 --> MM
    P3 --> MM
    MM --> DM
```
## Artifact Management

### Storage Interface

```go
type ArtifactStorage interface {
	// Store artifact with versioning
	Store(ctx context.Context, artifact *Artifact) (string, error)

	// Retrieve a specific version
	Get(ctx context.Context, name, version string) (*Artifact, error)

	// List versions
	ListVersions(ctx context.Context, name string) ([]Version, error)

	// Generate a signed download URL
	GetDownloadURL(ctx context.Context, artifact *Artifact) (string, error)
}
```
Implementations:

- S3/MinIO for cloud/self-hosted
- Local filesystem for development (sketched below)
- OCI registry for container images
- IPFS for distributed storage
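For development, the filesystem backend can be very small. A hypothetical sketch implementing two of the four methods, assuming the standard `context`, `os`, and `path/filepath` imports (the `Artifact` fields used here are assumptions):

```go
// fsStorage is a toy ArtifactStorage backed by a local directory.
type fsStorage struct {
	root string
}

func (s *fsStorage) Store(ctx context.Context, artifact *Artifact) (string, error) {
	dir := filepath.Join(s.root, artifact.Name, artifact.Version)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return "", err
	}
	path := filepath.Join(dir, "artifact")
	if err := os.WriteFile(path, artifact.Content, 0o644); err != nil {
		return "", err
	}
	return path, nil
}

func (s *fsStorage) Get(ctx context.Context, name, version string) (*Artifact, error) {
	content, err := os.ReadFile(filepath.Join(s.root, name, version, "artifact"))
	if err != nil {
		return nil, err
	}
	return &Artifact{Name: name, Version: version, Content: content}, nil
}
```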
### Artifact Metadata

```protobuf
message Artifact {
  string id = 1;
  string name = 2;
  string version = 3;
  ArtifactType type = 4;

  // Build metadata
  BuildInfo build_info = 5;

  // Runtime requirements
  Requirements requirements = 6;

  // Signatures for verification
  repeated Signature signatures = 7;

  // Size and checksums
  int64 size = 8;
  map<string, string> checksums = 9; // sha256, sha512, etc.

  // Storage location
  string storage_url = 10;

  // Platform compatibility
  repeated Platform platforms = 11;

  google.protobuf.Timestamp created_at = 12;
  map<string, string> labels = 13;
}
message Requirements {
  // Minimum resources
  int64 min_memory = 1;
  int64 min_storage = 2;
  int32 min_cpu_mhz = 3;

  // Required capabilities
  repeated string capabilities = 4; // "docker", "systemd", "gpu"

  // OS constraints
  string os_family = 5;  // "linux", "rtos"
  string min_kernel = 6; // "5.10"
  repeated string arch = 7; // ["arm64", "amd64"]
}
```
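The control plane can use `Requirements` to filter eligible devices before scheduling. A hypothetical matching check (the `Device` fields are assumptions; the `Requirements` field names follow the usual protoc-gen-go conventions):

```go
// meetsRequirements reports whether a device can run an artifact.
func meetsRequirements(d *Device, r *Requirements) bool {
	if d.MemoryBytes < r.MinMemory || d.StorageBytes < r.MinStorage {
		return false
	}
	if d.OSFamily != r.OsFamily {
		return false
	}

	// Every required capability must be present on the device.
	caps := make(map[string]bool, len(d.Capabilities))
	for _, c := range d.Capabilities {
		caps[c] = true
	}
	for _, c := range r.Capabilities {
		if !caps[c] {
			return false
		}
	}

	// The device architecture must be in the artifact's supported list.
	for _, a := range r.Arch {
		if a == d.Arch {
			return true
		}
	}
	return false
}
```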
## Application Manifest

```yaml
# app.yaml - Application manifest
apiVersion: fleet.v1
kind: Application
metadata:
  name: edge-inference
  version: 2.1.0
spec:
  artifacts:
    - name: inference-engine
      version: 2.1.0
      type: binary
    - name: model-server
      version: 1.5.0
      type: oci-image

  configuration:
    env:
      MODEL_PATH: /data/models
      API_PORT: 8080
    files:
      - path: /etc/inference/config.yaml
        content: |
          model:
            type: yolov5
            threshold: 0.7

  runtime:
    type: systemd # or docker, supervisor, raw
    restart: always
    startup_timeout: 30s

  healthcheck:
    http:
      path: /health
      port: 8080
    interval: 30s
    timeout: 5s
    retries: 3

  resources:
    limits:
      memory: 2Gi
      cpu: 2000m
    requests:
      memory: 1Gi
      cpu: 500m

  placement:
    device_selector:
      type: raspberry-pi
      min_memory: 4Gi
    node_affinity:
      - key: location
        operator: In
        values: ["edge", "store"]
```
## Deployment Strategies

### Canary

```go
type CanaryStrategy struct {
	InitialPercentage   int32
	IncrementPercentage int32
	IncrementInterval   time.Duration
	SuccessCriteria     SuccessCriteria
	RollbackTriggers    []RollbackTrigger
}
func (c *CanaryStrategy) Execute(deployment *Deployment) error {
	// Roll out in growing batches, starting at InitialPercentage of the
	// fleet and expanding by IncrementPercentage each round.
	for _, batch := range c.generateBatches(deployment.TargetDevices) {
		// Deploy to this batch
		if err := deployToBatch(batch); err != nil {
			return c.rollback(deployment, err)
		}

		// Monitor metrics against the success criteria
		if !c.checkSuccess(batch) {
			return c.rollback(deployment, ErrCanaryFailed)
		}

		// Wait before expanding to the next batch
		time.Sleep(c.IncrementInterval)
	}
	return nil
}
```
### Blue-Green

```go
type BlueGreenStrategy struct {
	TrafficSwitchDelay time.Duration
	ValidationPeriod   time.Duration
	AutoRollback       bool
}

func (bg *BlueGreenStrategy) Execute(deployment *Deployment) error {
	// Deploy the new version to the "green" environment
	green := deployment.CreateGreenEnvironment()
	if err := green.Deploy(); err != nil {
		return err
	}

	// Run validation before taking traffic
	if err := green.Validate(bg.ValidationPeriod); err != nil {
		green.Cleanup()
		return err
	}

	// Switch traffic to green
	if err := deployment.SwitchTraffic(green); err != nil {
		if bg.AutoRollback {
			deployment.SwitchTraffic(deployment.Blue)
		}
		return err
	}

	// Clean up the old (blue) version once the switch has settled
	time.AfterFunc(bg.TrafficSwitchDelay, func() {
		deployment.Blue.Cleanup()
	})
	return nil
}
```
### Rolling Update

```go
type RollingUpdateStrategy struct {
	MaxUnavailable         int32
	MaxSurge               int32
	UpdateBatchSize        int32
	HealthCheckGracePeriod time.Duration
}
```
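The struct above only carries the knobs; an execution loop might look like this hypothetical sketch (`deployToBatch` and `waitHealthy` are assumed helpers, not fleetd's actual API):

```go
func (r *RollingUpdateStrategy) Execute(deployment *Deployment) error {
	devices := deployment.TargetDevices

	// Batch size is capped by MaxUnavailable so enough replicas stay up.
	batch := int(r.UpdateBatchSize)
	if m := int(r.MaxUnavailable); m > 0 && batch > m {
		batch = m
	}
	if batch <= 0 {
		batch = 1
	}

	for i := 0; i < len(devices); i += batch {
		end := i + batch
		if end > len(devices) {
			end = len(devices)
		}
		// Update one batch of devices.
		if err := deployToBatch(devices[i:end]); err != nil {
			return err
		}
		// Give new processes time to become healthy before continuing.
		if err := waitHealthy(devices[i:end], r.HealthCheckGracePeriod); err != nil {
			return err
		}
	}
	return nil
}
```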
## Process Management

```go
type ProcessManager struct {
	processes map[string]*ManagedProcess
	mu        sync.RWMutex
}

type ManagedProcess struct {
	App          *Application
	Cmd          *exec.Cmd
	StartTime    time.Time
	RestartCount int32

	// Monitoring
	metrics     *ProcessMetrics
	logs        *LogCollector
	healthcheck *HealthChecker
}
func (p *ManagedProcess) Start() error {
	// Prepare environment
	p.Cmd.Env = p.App.GetEnvironment()

	// Run in its own process group so the whole tree can be signalled.
	// Resource limits (memory, CPU) are enforced separately, e.g. via
	// cgroups: syscall.SysProcAttr has no rlimit field in the Go stdlib.
	p.Cmd.SysProcAttr = &syscall.SysProcAttr{
		Setpgid: true,
	}

	// Start process
	if err := p.Cmd.Start(); err != nil {
		return err
	}

	// Start monitoring
	go p.collectMetrics()
	go p.streamLogs()
	go p.checkHealth()
	return nil
}
func (p *ManagedProcess) collectMetrics() {
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for range ticker.C {
		// getProcessStats samples per-process stats (e.g. from /proc)
		stats := p.getProcessStats()
		p.metrics.Record(ProcessStats{
			PID:        p.Cmd.Process.Pid,
			CPUPercent: stats.CPUPercent(),
			Memory:     stats.MemoryInfo(),
			FDs:        stats.NumFDs(),
			Threads:    stats.NumThreads(),
			IOCounters: stats.IOCounters(),
		})
	}
}
```
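`getProcessStats` is left abstract above. One common way to source such numbers on Linux is the `github.com/shirou/gopsutil` library; a hypothetical sketch (the `ProcessStats` struct and its fields are assumptions):

```go
import "github.com/shirou/gopsutil/v3/process"

// sampleProcess reads CPU, memory, and thread stats for a child process.
func sampleProcess(pid int32) (*ProcessStats, error) {
	proc, err := process.NewProcess(pid)
	if err != nil {
		return nil, err
	}
	cpu, err := proc.CPUPercent()
	if err != nil {
		return nil, err
	}
	mem, err := proc.MemoryInfo()
	if err != nil {
		return nil, err
	}
	threads, err := proc.NumThreads()
	if err != nil {
		return nil, err
	}
	return &ProcessStats{
		PID:        int(pid),
		CPUPercent: cpu,
		Memory:     mem,
		Threads:    threads,
	}, nil
}
```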
## Rollback

```go
type RollbackTrigger struct {
	Type      TriggerType
	Threshold float64
	Duration  time.Duration
	Action    RollbackAction
}

var DefaultTriggers = []RollbackTrigger{
	{
		Type:      CrashLoopBackoff,
		Threshold: 3, // restarts
		Duration:  5 * time.Minute,
		Action:    ImmediateRollback,
	},
	{
		Type:      HighErrorRate,
		Threshold: 0.05, // 5% error rate
		Duration:  2 * time.Minute,
		Action:    GradualRollback,
	},
	{
		Type:      HealthCheckFailure,
		Threshold: 0.5, // 50% failing
		Duration:  1 * time.Minute,
		Action:    ImmediateRollback,
	},
}
func (d *Deployment) MonitorForRollback() {
	for _, trigger := range d.RollbackTriggers {
		go func(t RollbackTrigger) {
			// Poll continuously rather than checking once.
			ticker := time.NewTicker(10 * time.Second)
			defer ticker.Stop()
			for range ticker.C {
				if t.ShouldTrigger(d.GetMetrics()) {
					d.InitiateRollback(t.Action)
					return
				}
			}
		}(trigger)
	}
}
```
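`ShouldTrigger` is not shown; the intent is that a trigger fires only when a metric stays past its threshold for the full `Duration`. A hypothetical sketch (the `Metrics.Window` accessor is an assumption):

```go
// ShouldTrigger reports whether the observed value has exceeded the
// threshold continuously for at least t.Duration.
func (t RollbackTrigger) ShouldTrigger(m *Metrics) bool {
	window := m.Window(t.Type, t.Duration) // samples from the last Duration
	if len(window) == 0 {
		return false
	}
	for _, sample := range window {
		if sample.Value < t.Threshold {
			return false // dipped below threshold, so no sustained breach
		}
	}
	return true
}
```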
## CI/CD Integration

### GitHub Actions

```yaml
name: Deploy to Fleet

on:
  push:
    tags:
      - 'v*'

jobs:
  build-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Build Application
        run: |
          cargo build --release --target aarch64-unknown-linux-gnu

      - name: Upload to fleetd
        uses: fleetd/upload-artifact@v1
        with:
          endpoint: ${{ secrets.FLEETD_ENDPOINT }}
          api_key: ${{ secrets.FLEETD_API_KEY }}
          name: my-app
          version: ${{ github.ref_name }}
          # Cross-compiled binaries land under target/<triple>/release
          file: target/aarch64-unknown-linux-gnu/release/my-app
          platforms: linux/arm64

      - name: Create Deployment
        uses: fleetd/deploy@v1
        with:
          application: my-app
          version: ${{ github.ref_name }}
          strategy: canary
          target_group: production
          canary_percentage: 10
          increment: 20
          interval: 5m
```
### Direct API Usage

```bash
# Upload artifact
curl -X POST https://fleet.example.com/api/v1/artifacts \
  -H "Authorization: Bearer $API_KEY" \
  -F "file=@myapp-v2.0.0-arm64" \
  -F 'metadata={
    "name": "myapp",
    "version": "2.0.0",
    "platform": "linux/arm64",
    "type": "binary"
  }'

# Create deployment
curl -X POST https://fleet.example.com/api/v1/deployments \
  -H "Authorization: Bearer $API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "application": "myapp",
    "version": "2.0.0",
    "strategy": {
      "type": "rolling",
      "max_unavailable": 1,
      "health_check_grace": "30s"
    },
    "target": {
      "group": "edge-stores",
      "selector": {
        "location": "us-west"
      }
    }
  }'
```
## Monitoring

### Application Metrics

```protobuf
message AppMetrics {
  string app_id = 1;
  string device_id = 2;

  // Process metrics
  float cpu_percent = 3;
  uint64 memory_bytes = 4;
  uint64 disk_io_read = 5;
  uint64 disk_io_write = 6;
  uint64 network_rx = 7;
  uint64 network_tx = 8;

  // Application metrics
  uint32 restart_count = 9;
  uint64 uptime_seconds = 10;
  map<string, double> custom_metrics = 11;

  // Health status
  HealthStatus health = 12;
  string health_message = 13;

  google.protobuf.Timestamp timestamp = 14;
}
```
### Log Streaming

```go
type LogStreamer struct {
	app    *Application
	buffer *ring.Ring
	subs   []chan LogEntry
}

func (l *LogStreamer) Stream(ctx context.Context) {
	scanner := bufio.NewScanner(l.app.Stdout)
	for scanner.Scan() {
		// Stop streaming when the context is cancelled
		select {
		case <-ctx.Done():
			return
		default:
		}

		line := scanner.Text()
		entry := LogEntry{
			Timestamp: time.Now(),
			Level:     l.parseLevel(line),
			Message:   line,
			AppID:     l.app.ID,
		}

		// Store in the ring buffer
		l.buffer.Value = entry
		l.buffer = l.buffer.Next()

		// Fan out to subscribers without blocking on slow ones
		for _, sub := range l.subs {
			select {
			case sub <- entry:
			default: // Don't block
			}
		}

		// Forward to central logging if configured
		if l.app.Config.CentralLogging {
			l.sendToCentral(entry)
		}
	}
}
```
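Consumers attach to the streamer with a buffered channel, which is why the non-blocking send above can usually deliver. A hypothetical `Subscribe` method (a real implementation would also guard `subs` with a mutex):

```go
// Subscribe registers a consumer for live log entries. The buffer
// absorbs brief consumer stalls so Stream's non-blocking send
// rarely has to drop an entry.
func (l *LogStreamer) Subscribe() <-chan LogEntry {
	ch := make(chan LogEntry, 256)
	l.subs = append(l.subs, ch)
	return ch
}
```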
## Security

```go
func (a *Artifact) Verify() error {
	// Check signatures
	for _, sig := range a.Signatures {
		if err := sig.Verify(a.Content); err != nil {
			return fmt.Errorf("signature verification failed: %w", err)
		}
	}

	// Verify checksums
	actual := sha256.Sum256(a.Content)
	expected := a.Checksums["sha256"]
	if hex.EncodeToString(actual[:]) != expected {
		return ErrChecksumMismatch
	}

	// Scan for vulnerabilities if configured
	if scanner := GetVulnerabilityScanner(); scanner != nil {
		if vulns := scanner.Scan(a); len(vulns) > 0 {
			return fmt.Errorf("vulnerabilities found: %v", vulns)
		}
	}
	return nil
}
```

## Design Highlights

- Complete Lifecycle Management: From CI/CD to production monitoring
- Multi-Strategy Deployments: Canary, blue-green, rolling updates
- Automatic Rollback: Based on health metrics and error rates
- Platform Agnostic: Supports binaries, containers, packages
- Secure by Default: Signed artifacts, vulnerability scanning
- Observable: Comprehensive metrics and log collection
- Resilient: Handles network failures, partial deployments
- Scalable: Works from 10 to 10,000+ devices