@@ -34,6 +34,8 @@ type ExecutionMode string
34
34
const (
35
35
SubprocessExecutionMode ExecutionMode = "subprocess"
36
36
EmbeddedExecutionMode ExecutionMode = "embedded"
37
+ // waitTimeForStop is the time to wait for the collector to stop before killing it.
38
+ waitTimeForStop = 30 * time .Second
37
39
)
38
40
39
41
type collectorRecoveryTimer interface {
@@ -101,6 +103,9 @@ type OTelManager struct {
101
103
execution collectorExecution
102
104
103
105
proc collectorHandle
106
+
107
+ // collectorRunErr is used to signal that the collector has exited.
108
+ collectorRunErr chan error
104
109
}
105
110
106
111
// NewOTelManager returns an OTelManager.
@@ -131,7 +136,7 @@ func NewOTelManager(
131
136
recoveryTimer = newRestarterNoop ()
132
137
exec = newExecutionEmbedded ()
133
138
default :
134
- return nil , errors .New ("unknown otel collector exec " )
139
+ return nil , errors .New ("unknown otel collector execModeFn " )
135
140
}
136
141
137
142
logger .Debugf ("Using collector execution mode: %s" , mode )
@@ -144,10 +149,11 @@ func NewOTelManager(
144
149
errCh : make (chan error , 1 ), // holds at most one error
145
150
collectorStatusCh : make (chan * status.AggregateStatus , 1 ),
146
151
componentStateCh : make (chan []runtime.ComponentComponentState , 1 ),
147
- updateCh : make (chan configUpdate ),
152
+ updateCh : make (chan configUpdate , 1 ),
148
153
doneChan : make (chan struct {}),
149
154
execution : exec ,
150
155
recoveryTimer : recoveryTimer ,
156
+ collectorRunErr : make (chan error ),
151
157
}, nil
152
158
}
153
159
@@ -156,24 +162,21 @@ func (m *OTelManager) Run(ctx context.Context) error {
156
162
var err error
157
163
m .proc = nil
158
164
159
- // signal that the run loop is ended to unblock any incoming update calls
160
- defer close (m .doneChan )
161
-
162
- // collectorRunErr is used to signal that the collector has exited.
163
- collectorRunErr := make (chan error )
164
-
165
165
// collectorStatusCh is used internally by the otel collector to send status updates to the manager
166
166
// this channel is buffered because it's possible for the collector to send a status update while the manager is
167
167
// waiting for the collector to exit
168
168
collectorStatusCh := make (chan * status.AggregateStatus , 1 )
169
169
for {
170
170
select {
171
171
case <- ctx .Done ():
172
+ // signal that the run loop is ended to unblock any incoming update calls
173
+ close (m .doneChan )
174
+
172
175
m .recoveryTimer .Stop ()
173
176
// our caller context is cancelled so stop the collector and return
174
177
// once it has exited.
175
178
if m .proc != nil {
176
- m .proc .Stop (ctx )
179
+ m .proc .Stop (waitTimeForStop )
177
180
}
178
181
return ctx .Err ()
179
182
case <- m .recoveryTimer .C ():
@@ -187,7 +190,7 @@ func (m *OTelManager) Run(ctx context.Context) error {
187
190
188
191
newRetries := m .recoveryRetries .Add (1 )
189
192
m .logger .Infof ("collector recovery restarting, total retries: %d" , newRetries )
190
- m .proc , err = m .execution .startCollector (ctx , m .baseLogger , m .mergedCollectorCfg , collectorRunErr , collectorStatusCh )
193
+ m .proc , err = m .execution .startCollector (ctx , m .baseLogger , m .mergedCollectorCfg , m . collectorRunErr , collectorStatusCh )
191
194
if err != nil {
192
195
reportErr (ctx , m .errCh , err )
193
196
// reset the restart timer to the next backoff
@@ -197,12 +200,12 @@ func (m *OTelManager) Run(ctx context.Context) error {
197
200
reportErr (ctx , m .errCh , nil )
198
201
}
199
202
200
- case err = <- collectorRunErr :
203
+ case err = <- m . collectorRunErr :
201
204
m .recoveryTimer .Stop ()
202
205
if err == nil {
203
206
// err is nil means that the collector has exited cleanly without an error
204
207
if m .proc != nil {
205
- m .proc .Stop (ctx )
208
+ m .proc .Stop (waitTimeForStop )
206
209
m .proc = nil
207
210
updateErr := m .reportOtelStatusUpdate (ctx , nil )
208
211
if updateErr != nil {
@@ -223,7 +226,7 @@ func (m *OTelManager) Run(ctx context.Context) error {
223
226
224
227
// in this rare case the collector stopped running but a configuration was
225
228
// provided and the collector stopped with a clean exit
226
- m .proc , err = m .execution .startCollector (ctx , m .baseLogger , m .mergedCollectorCfg , collectorRunErr , collectorStatusCh )
229
+ m .proc , err = m .execution .startCollector (ctx , m .baseLogger , m .mergedCollectorCfg , m . collectorRunErr , collectorStatusCh )
227
230
if err != nil {
228
231
// failed to create the collector (this is different than
229
232
// it's failing to run). we do not retry creation on failure
@@ -245,7 +248,7 @@ func (m *OTelManager) Run(ctx context.Context) error {
245
248
// in the case that the configuration is invalid there is no reason to
246
249
// try again as it will keep failing so we do not trigger a restart
247
250
if m .proc != nil {
248
- m .proc .Stop (ctx )
251
+ m .proc .Stop (waitTimeForStop )
249
252
m .proc = nil
250
253
// don't wait here for <-collectorRunErr, already occurred
251
254
// clear status, no longer running
@@ -281,7 +284,7 @@ func (m *OTelManager) Run(ctx context.Context) error {
281
284
m .components = cfgUpdate .components
282
285
m .mx .Unlock ()
283
286
284
- err = m .applyMergedConfig (ctx , collectorStatusCh , collectorRunErr )
287
+ err = m .applyMergedConfig (ctx , collectorStatusCh , m . collectorRunErr )
285
288
// report the error unconditionally to indicate that the config was applied
286
289
reportErr (ctx , m .errCh , err )
287
290
@@ -340,7 +343,7 @@ func buildMergedConfig(cfgUpdate configUpdate, agentInfo info.Agent, monitoringC
340
343
341
344
func (m * OTelManager ) applyMergedConfig (ctx context.Context , collectorStatusCh chan * status.AggregateStatus , collectorRunErr chan error ) error {
342
345
if m .proc != nil {
343
- m .proc .Stop (ctx )
346
+ m .proc .Stop (waitTimeForStop )
344
347
m .proc = nil
345
348
select {
346
349
case <- collectorRunErr :
@@ -402,6 +405,15 @@ func (m *OTelManager) Update(cfg *confmap.Conf, components []component.Component
402
405
collectorCfg : cfg ,
403
406
components : components ,
404
407
}
408
+
409
+ // we care only about the latest config update
410
+ select {
411
+ case <- m .updateCh :
412
+ case <- m .doneChan :
413
+ return
414
+ default :
415
+ }
416
+
405
417
select {
406
418
case m .updateCh <- cfgUpdate :
407
419
case <- m .doneChan :
0 commit comments