diff --git a/docs/release-notes/release-notes-0.16.1.md b/docs/release-notes/release-notes-0.16.1.md new file mode 100644 index 000000000..391bfaabc --- /dev/null +++ b/docs/release-notes/release-notes-0.16.1.md @@ -0,0 +1,45 @@ +# Release Notes + +- [Lightning Terminal](#lightning-terminal) + - [Bug Fixes](#bug-fixes) + - [Functional Changes/Additions](#functional-changesadditions) + - [Technical and Architectural Updates](#technical-and-architectural-updates) +- [Integrated Binary Updates](#integrated-binary-updates) + - [LND](#lnd) + - [Loop](#loop) + - [Pool](#pool) + - [Faraday](#faraday) + - [Taproot Assets](#taproot-assets) +- [Contributors](#contributors-alphabetical-order) + +## Lightning Terminal + +### Bug Fixes + +### Functional Changes/Additions + +* [PR](https://github.com/lightninglabs/lightning-terminal/pull/1183): LiT now + fails fast if a critical integrated sub-server cannot start. The critical set + currently includes only tapd; other integrated sub-servers can fail to start + without blocking LiT, and their errors are recorded in the status manager. +* Integrated-mode sub-servers now start deterministically: critical integrated + services are launched first, followed by the remaining services in + alphabetical order to keep startup ordering stable across runs. + +### Technical and Architectural Updates + +## RPC Updates + +## Integrated Binary Updates + +### LND + +### Loop + +### Pool + +### Faraday + +### Taproot Assets + +# Contributors (Alphabetical Order) diff --git a/itest/litd_mode_integrated_test.go b/itest/litd_mode_integrated_test.go index f6b9a7ca8..f7fa65278 100644 --- a/itest/litd_mode_integrated_test.go +++ b/itest/litd_mode_integrated_test.go @@ -465,6 +465,38 @@ func testModeIntegrated(ctx context.Context, net *NetworkHarness, ) } +// testCriticalTapStartupFailure ensures LiT exits quickly when a critical +// integrated sub-server (tapd) fails to start during boot. 
+func testCriticalTapStartupFailure(ctx context.Context, net *NetworkHarness, + t *harnessTest) { + + // Force tapd to bind to an invalid port to guarantee a startup failure + // in integrated mode. + node, err := net.NewNode( + t.t, "FailFastTap", nil, false, false, + "--taproot-assets.rpclisten=127.0.0.1:65536", + ) + require.NoError(t.t, err) + + defer func() { + _ = net.ShutdownNode(node) + }() + + select { + case procErr := <-net.ProcessErrors(): + require.ErrorContains(t.t, procErr, "invalid port") + case <-time.After(15 * time.Second): + t.Fatalf("expected tapd startup failure to be reported") + } + + // LiT should terminate promptly after the critical startup failure. + select { + case <-node.processExit: + case <-time.After(5 * time.Second): + t.Fatalf("litd did not exit after tapd startup failure") + } +} + // integratedTestSuite makes sure that in integrated mode all daemons work // correctly. func integratedTestSuite(ctx context.Context, net *NetworkHarness, t *testing.T, diff --git a/itest/litd_test_list_on_test.go b/itest/litd_test_list_on_test.go index fda3f1653..5ceb06f3f 100644 --- a/itest/litd_test_list_on_test.go +++ b/itest/litd_test_list_on_test.go @@ -87,6 +87,11 @@ var allTestCases = []*testCase{ test: testCustomChannelsHtlcForceCloseMpp, noAliceBob: true, }, + { + name: "critical tap startup failure", + test: testCriticalTapStartupFailure, + noAliceBob: true, + }, { name: "custom channels balance consistency", test: testCustomChannelsBalanceConsistency, diff --git a/subservers/interface.go b/subservers/interface.go index 4ff0083cf..4f5e7c84e 100644 --- a/subservers/interface.go +++ b/subservers/interface.go @@ -14,9 +14,7 @@ import ( // SubServer defines an interface that should be implemented by any sub-server // that the subServer manager should manage. A sub-server can be run in either -// integrated or remote mode. 
A sub-server is considered non-fatal to LiT -// meaning that if a sub-server fails to start, LiT can safely continue with its -// operations and other sub-servers can too. +// integrated or remote mode. type SubServer interface { macaroons.MacaroonValidator diff --git a/subservers/manager.go b/subservers/manager.go index 8d9f6b1b3..de0d43c14 100644 --- a/subservers/manager.go +++ b/subservers/manager.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io/ioutil" + "sort" "sync" "time" @@ -11,6 +12,7 @@ import ( "github.com/lightninglabs/lightning-terminal/perms" "github.com/lightninglabs/lightning-terminal/status" "github.com/lightninglabs/lndclient" + "github.com/lightningnetwork/lnd/fn" "github.com/lightningnetwork/lnd/lncfg" "github.com/lightningnetwork/lnd/lnrpc" grpcProxy "github.com/mwitkow/grpc-proxy/proxy" @@ -29,6 +31,11 @@ var ( // defaultConnectTimeout is the default timeout for connecting to the // backend. defaultConnectTimeout = 15 * time.Second + + // criticalIntegratedSubServers lists integrated sub-servers that must + // succeed during startup. Failures from these sub-servers are surfaced + // to LiT and abort the startup sequence. + criticalIntegratedSubServers = fn.NewSet[string](TAP) ) // Manager manages a set of subServer objects. @@ -104,14 +111,37 @@ func (s *Manager) GetServer(name string) (SubServer, bool) { } // StartIntegratedServers starts all the manager's sub-servers that should be -// started in integrated mode. +// started in integrated mode. An error is returned if any critical integrated +// sub-server fails to start. func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient, - lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) { + lndGrpc *lndclient.GrpcLndServices, withMacaroonService bool) error { s.mu.Lock() defer s.mu.Unlock() + // Sort for deterministic startup: critical integrated sub-servers + // first, then alphabetical to keep the order stable across runs. 
+ servers := make([]*subServerWrapper, 0, len(s.servers)) for _, ss := range s.servers { + servers = append(servers, ss) + } + + sort.Slice(servers, func(i, j int) bool { + iCritical := criticalIntegratedSubServers.Contains( + servers[i].Name(), + ) + jCritical := criticalIntegratedSubServers.Contains( + servers[j].Name(), + ) + + if iCritical != jCritical { + return iCritical + } + + return servers[i].Name() < servers[j].Name() + }) + + for _, ss := range servers { if ss.Remote() { continue } @@ -126,11 +156,18 @@ func (s *Manager) StartIntegratedServers(lndClient lnrpc.LightningClient, ) if err != nil { s.statusServer.SetErrored(ss.Name(), err.Error()) + + if criticalIntegratedSubServers.Contains(ss.Name()) { + return fmt.Errorf("%s: %w", ss.Name(), err) + } + continue } s.statusServer.SetRunning(ss.Name()) } + + return nil } // ConnectRemoteSubServers creates connections to all the manager's sub-servers diff --git a/subservers/manager_test.go b/subservers/manager_test.go new file mode 100644 index 000000000..279e68bed --- /dev/null +++ b/subservers/manager_test.go @@ -0,0 +1,189 @@ +package subservers + +import ( + "context" + "errors" + "testing" + + restProxy "github.com/grpc-ecosystem/grpc-gateway/v2/runtime" + "github.com/lightninglabs/lightning-terminal/litrpc" + "github.com/lightninglabs/lightning-terminal/perms" + "github.com/lightninglabs/lightning-terminal/status" + "github.com/lightninglabs/lndclient" + tafn "github.com/lightninglabs/taproot-assets/fn" + "github.com/lightningnetwork/lnd/lnrpc" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "gopkg.in/macaroon-bakery.v2/bakery" +) + +// mockSubServer is a lightweight SubServer test double. +type mockSubServer struct { + // name is returned by Name(). + name string + + // remote toggles Remote() return value. + remote bool + + // startErr is returned from Start() when set. + startErr error + + // started tracks whether Start() succeeded. 
+ started bool +} + +// Name returns the mock sub-server name. +func (m *mockSubServer) Name() string { + return m.name +} + +// Remote indicates whether the sub-server runs remotely. +func (m *mockSubServer) Remote() bool { + return m.remote +} + +// RemoteConfig returns nil for the mock. +func (m *mockSubServer) RemoteConfig() *RemoteDaemonConfig { + return nil +} + +// Start marks the server started unless startErr is set. +func (m *mockSubServer) Start(_ lnrpc.LightningClient, + _ *lndclient.GrpcLndServices, _ bool) error { + + if m.startErr != nil { + return m.startErr + } + + m.started = true + + return nil +} + +// Stop marks the server as stopped. +func (m *mockSubServer) Stop() error { + m.started = false + return nil +} + +// RegisterGrpcService is a no-op for the mock. +func (m *mockSubServer) RegisterGrpcService(_ grpc.ServiceRegistrar) {} + +// RegisterRestService is a no-op for the mock. +func (m *mockSubServer) RegisterRestService(_ context.Context, + _ *restProxy.ServeMux, _ string, _ []grpc.DialOption) error { + + return nil +} + +// ServerErrChan returns nil for the mock. +func (m *mockSubServer) ServerErrChan() chan error { + return nil +} + +// MacPath returns an empty string for the mock. +func (m *mockSubServer) MacPath() string { + return "" +} + +// Permissions returns nil for the mock. +func (m *mockSubServer) Permissions() map[string][]bakery.Op { + return nil +} + +// WhiteListedURLs returns nil for the mock. +func (m *mockSubServer) WhiteListedURLs() map[string]struct{} { + return nil +} + +// Impl returns an empty option for the mock. +func (m *mockSubServer) Impl() tafn.Option[any] { + return tafn.None[any]() +} + +// ValidateMacaroon always succeeds for the mock. +func (m *mockSubServer) ValidateMacaroon(context.Context, + []bakery.Op, string) error { + + return nil +} + +// newTestManager creates a Manager and status Manager with permissive perms. 
+func newTestManager(t *testing.T) (*Manager, *status.Manager) { + t.Helper() + + permsMgr, err := perms.NewManager(true) + require.NoError(t, err) + + statusMgr := status.NewStatusManager() + + return NewManager(permsMgr, statusMgr), statusMgr +} + +// TestStartIntegratedServersCriticalFailureStopsStartup ensures critical +// startup errors abort integrated startup. +func TestStartIntegratedServersCriticalFailureStopsStartup(t *testing.T) { + manager, statusMgr := newTestManager(t) + + nonCritical := &mockSubServer{name: "loop"} + critical := &mockSubServer{ + name: TAP, + startErr: errors.New("boom"), + } + + require.NoError(t, manager.AddServer(nonCritical, true)) + require.NoError(t, manager.AddServer(critical, true)) + + err := manager.StartIntegratedServers(nil, nil, true) + require.Error(t, err) + require.Contains(t, err.Error(), TAP) + + resp, err := statusMgr.SubServerStatus( + context.Background(), &litrpc.SubServerStatusReq{}, + ) + require.NoError(t, err) + + statuses := resp.SubServers + require.Contains(t, statuses, TAP) + require.Equal(t, "boom", statuses[TAP].Error) + require.False(t, statuses[TAP].Running) + + require.False( + t, nonCritical.started, "non-critical sub-server should not "+ + "start after critical failure", + ) +} + +// TestStartIntegratedServersNonCriticalFailureContinues verifies non-critical +// startup failures are tolerated. 
+func TestStartIntegratedServersNonCriticalFailureContinues(t *testing.T) { + manager, statusMgr := newTestManager(t) + + failing := &mockSubServer{ + name: "loop", + startErr: errors.New("start failed"), + } + succeeding := &mockSubServer{name: "pool"} + + require.NoError(t, manager.AddServer(failing, true)) + require.NoError(t, manager.AddServer(succeeding, true)) + + err := manager.StartIntegratedServers(nil, nil, true) + require.NoError(t, err) + + resp, err := statusMgr.SubServerStatus( + context.Background(), &litrpc.SubServerStatusReq{}, + ) + require.NoError(t, err) + + statuses := resp.SubServers + + require.Contains(t, statuses, failing.name) + require.Equal(t, "start failed", statuses[failing.name].Error) + require.False(t, statuses[failing.name].Running) + + require.True(t, succeeding.started) + require.Contains(t, statuses, succeeding.name) + require.True(t, statuses[succeeding.name].Running) + require.Empty(t, statuses[succeeding.name].Error) +} diff --git a/terminal.go b/terminal.go index 498be4697..c8317978e 100644 --- a/terminal.go +++ b/terminal.go @@ -405,6 +405,31 @@ func (g *LightningTerminal) Run(ctx context.Context) error { g.statusMgr.SetErrored( subservers.LIT, "could not start Lit: %v", startErr, ) + + // Stop lnd promptly to avoid it continuing to run after a + // failed startup (for example, calling into failed + // sub-servers). 
+ if client, err := g.basicLNDClient(); err == nil { + stopCtx, cancel := context.WithTimeout( + ctx, 5*time.Second, + ) + defer cancel() + + _, err := client.StopDaemon( + stopCtx, &lnrpc.StopRequest{}, + ) + if err != nil { + log.Warnf("Error stopping lnd after failed "+ + "start: %v", err) + } + } + + if err := g.shutdownSubServers(); err != nil { + log.Errorf("Error shutting down after failed start: %v", + err) + } + + return startErr } // Now block until we receive an error or the main shutdown @@ -422,11 +447,10 @@ func (g *LightningTerminal) Run(ctx context.Context) error { return startErr } -// start attempts to start all the various components of Litd. Only Litd and -// LND errors are considered fatal and will result in an error being returned. -// If any of the sub-servers managed by the subServerMgr error while starting -// up, these are considered non-fatal and will not result in an error being -// returned. +// start attempts to start all the various components of Litd. LND errors and +// critical integrated sub-server failures are fatal and will result in an +// error being returned. Non-critical sub-server startup failures are recorded +// in the status manager but do not stop startup. func (g *LightningTerminal) start(ctx context.Context) error { var err error @@ -769,9 +793,13 @@ func (g *LightningTerminal) start(ctx context.Context) error { // Both connection types are ready now, let's start our sub-servers if // they should be started locally as an integrated service. createDefaultMacaroons := !g.cfg.statelessInitMode - g.subServerMgr.StartIntegratedServers( + err = g.subServerMgr.StartIntegratedServers( g.basicClient, g.lndClient, createDefaultMacaroons, ) + if err != nil { + return fmt.Errorf("could not start integrated sub-servers: %w", + err) + } err = g.startInternalSubServers(ctx, !g.cfg.statelessInitMode) if err != nil {