fix: handle CUDA errors by splitting proof requests into single-block chunks #313

Closed
wants to merge 1 commit
69 changes: 53 additions & 16 deletions proposer/op/proposer/prove.go
@@ -17,6 +17,26 @@ import (
)

const PROOF_STATUS_TIMEOUT = 30 * time.Second
+const WITNESSGEN_TIMEOUT = 20 * time.Minute
+
+// Status constants
+const (
+    SP1FulfillmentStatusUnfulfillable = 2
+    SP1FulfillmentStatusFulfilled     = 1
+    SP1ExecutionStatusUnexecutable    = 2
+)
+
+// ProofStatusResponse represents the response from the prover service
+type ProofStatusResponse struct {
+    FulfillmentStatus int    `json:"fulfillment_status"`
+    ExecutionStatus   int    `json:"execution_status"`
+    Error             string `json:"error"`
+    Proof             []byte `json:"proof"`
+}
+
+// This limit is set to prevent overloading the witness generation server. Until Kona improves their native I/O API (https://github.com/anton-rs/kona/issues/553)
+// the maximum number of concurrent witness generation requests is roughly num_cpu / 2. Set it to 5 for now to be safe.
+const MAX_CONCURRENT_WITNESS_GEN = 5

// Process all of requests in PROVING state.
func (l *L2OutputSubmitter) ProcessProvingRequests() error {
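
Illustration (not part of the diff): a minimal, self-contained sketch of how a prover status payload might be decoded into the new ProofStatusResponse type and checked against the status constants above. The JSON values, the error text, and the standalone program structure are hypothetical.

package main

import (
    "encoding/json"
    "fmt"
    "strings"
)

// Local copies of the declarations added in this PR, so the sketch compiles on its own.
const SP1ExecutionStatusUnexecutable = 2

type ProofStatusResponse struct {
    FulfillmentStatus int    `json:"fulfillment_status"`
    ExecutionStatus   int    `json:"execution_status"`
    Error             string `json:"error"`
    Proof             []byte `json:"proof"`
}

func main() {
    // Hypothetical payload for a span request that failed on the GPU prover.
    raw := []byte(`{"fulfillment_status": 2, "execution_status": 2, "error": "CUDA out of memory", "proof": null}`)

    var status ProofStatusResponse
    if err := json.Unmarshal(raw, &status); err != nil {
        panic(err)
    }

    // Part of the condition RetryRequest (below) uses to decide whether to
    // split the span request instead of retrying it unchanged.
    split := status.ExecutionStatus == SP1ExecutionStatusUnexecutable || strings.Contains(status.Error, "CUDA")
    fmt.Println("split the request:", split)
}
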
@@ -70,7 +90,7 @@ func (l *L2OutputSubmitter) ProcessWitnessgenRequests() error {
    for _, req := range reqs {
        // If the request has been in the WITNESSGEN state for longer than the timeout, set status to FAILED.
        // This is a catch-all in case the witness generation state update failed.
-        if req.LastUpdatedTime+uint64(l.Cfg.WitnessGenTimeout) < uint64(time.Now().Unix()) {
+        if req.LastUpdatedTime+uint64(WITNESSGEN_TIMEOUT.Seconds()) < uint64(time.Now().Unix()) {
            // Retry the request if it timed out.
            l.RetryRequest(req, ProofStatusResponse{})
        }
@@ -94,18 +114,36 @@ func (l *L2OutputSubmitter) RetryRequest(req *ent.ProofRequest, status ProofStat
    // This is likely caused by an SP1 OOM due to a large block range with many transactions.
    // TODO: This solution can be removed once the embedded allocator is used, because then the programs
    // will never OOM.
-    if req.Type == proofrequest.TypeSPAN && status.ExecutionStatus == SP1ExecutionStatusUnexecutable && req.EndBlock-req.StartBlock > 1 {
-        // Split the request into two requests.
-        midBlock := (req.StartBlock + req.EndBlock) / 2
-        err = l.db.NewEntry(req.Type, req.StartBlock, midBlock)
-        if err != nil {
-            l.Log.Error("failed to retry first half of proof request", "err", err)
-            return err
+    if req.Type == proofrequest.TypeSPAN && (status.ExecutionStatus == SP1ExecutionStatusUnexecutable || strings.Contains(status.Error, "CUDA")) && req.EndBlock-req.StartBlock > 1 {
+        // For CUDA errors, split into smaller chunks
+        var chunks []struct{ start, end uint64 }
+        chunkSize := uint64(1) // Start with smallest possible chunk
+
+        if strings.Contains(status.Error, "CUDA") {
+            // Use very small chunks for CUDA errors
+            for start := req.StartBlock; start < req.EndBlock; start += chunkSize {
+                end := start + chunkSize
+                if end > req.EndBlock {
+                    end = req.EndBlock
+                }
+                chunks = append(chunks, struct{ start, end uint64 }{start, end})
+            }
+        } else {
+            // Original split in half behavior
+            midBlock := (req.StartBlock + req.EndBlock) / 2
+            chunks = []struct{ start, end uint64 }{
+                {req.StartBlock, midBlock},
+                {midBlock + 1, req.EndBlock},
+            }
        }
-        err = l.db.NewEntry(req.Type, midBlock, req.EndBlock)
-        if err != nil {
-            l.Log.Error("failed to retry second half of proof request", "err", err)
-            return err
+
+        // Create new entries for each chunk
+        for _, chunk := range chunks {
+            err = l.db.NewEntry(req.Type, chunk.start, chunk.end)
+            if err != nil {
+                l.Log.Error("failed to create new chunk entry", "start", chunk.start, "end", chunk.end, "err", err)
+                return err
+            }
        }
    } else {
        // Retry the same request.
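
Illustration (not part of the diff): a self-contained sketch of the chunking behavior added above, using hypothetical block numbers. A CUDA failure on a span request covering blocks 100 to 104 is retried as four single-block chunks with shared boundaries, while any other unexecutable failure keeps the original split-in-half behavior.

package main

import "fmt"

type chunk struct{ start, end uint64 }

// splitIntoChunks mirrors the retry branch above: single-block chunks when the
// prover reported a CUDA error, a split in half otherwise.
func splitIntoChunks(startBlock, endBlock uint64, cudaError bool) []chunk {
    if cudaError {
        chunkSize := uint64(1)
        var chunks []chunk
        for start := startBlock; start < endBlock; start += chunkSize {
            end := start + chunkSize
            if end > endBlock {
                end = endBlock
            }
            chunks = append(chunks, chunk{start, end})
        }
        return chunks
    }
    midBlock := (startBlock + endBlock) / 2
    return []chunk{{startBlock, midBlock}, {midBlock + 1, endBlock}}
}

func main() {
    // Hypothetical span request over blocks 100-104 that failed with a CUDA error.
    fmt.Println(splitIntoChunks(100, 104, true))  // [{100 101} {101 102} {102 103} {103 104}]
    fmt.Println(splitIntoChunks(100, 104, false)) // [{100 102} {103 104}]
}
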
@@ -157,7 +195,7 @@ func (l *L2OutputSubmitter) RequestQueuedProofs(ctx context.Context) error {

    // The number of witness generation requests is capped at MAX_CONCURRENT_WITNESS_GEN. This prevents overloading the machine with processes spawned by the witness generation server.
    // Once https://github.com/anton-rs/kona/issues/553 is fixed, we may be able to remove this check.
-    if witnessGenProofs >= int(l.Cfg.MaxConcurrentWitnessGen) {
+    if witnessGenProofs >= MAX_CONCURRENT_WITNESS_GEN {
        l.Log.Info("max witness generation reached, waiting for next cycle")
        return nil
    }
@@ -326,14 +364,13 @@ func (l *L2OutputSubmitter) makeProofRequest(proofType proofrequest.Type, jsonBo
    }
    req.Header.Set("Content-Type", "application/json")

-    timeout := time.Duration(l.Cfg.WitnessGenTimeout) * time.Second
-    client := &http.Client{Timeout: timeout}
+    client := &http.Client{Timeout: WITNESSGEN_TIMEOUT}
    resp, err := client.Do(req)
    if err != nil {
        if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
            l.Log.Error("Witness generation request timed out", "err", err)
            l.Metr.RecordWitnessGenFailure("Timeout")
-            return nil, fmt.Errorf("request timed out after %s: %w", timeout, err)
+            return nil, fmt.Errorf("request timed out after %s: %w", WITNESSGEN_TIMEOUT, err)
        }
        return nil, fmt.Errorf("failed to send request: %w", err)
    }