2 files changed: +20 −1
@@ -7,6 +7,8 @@
 from .backend import backend_name, tf, torch, paddle
 from .real import Real
 
+# Data parallel
+parallel_scaling = None
 # Data parallel via Horovod
 hvd = None
 comm = None
@@ -21,9 +23,10 @@
 if world_size > 1:
     from mpi4py import MPI
 
+    parallel_scaling = "weak"
     comm = MPI.COMM_WORLD
     tf.compat.v1.disable_eager_execution()  # Without this line, Horovod broadcasting fails.
-    rank = hvd.rank()  # Only single node acceleration supported so far.
+    rank = hvd.rank()  # Only single node acceleration supported so far.
     if rank == 0:
         print(f"\nParallel training with {world_size} processes.\n")
 else:
@@ -198,3 +201,15 @@ def disable_xla_jit():
     This is equivalent with ``enable_xla_jit(False)``.
     """
     enable_xla_jit(False)
+
+
+def set_parallel_scaling(scaling_mode):
+    """Sets the scaling mode for data parallel acceleration.
+    Weak scaling involves increasing the problem size proportionally with the number of processors,
+    while strong scaling involves keeping the problem size fixed and increasing the number of processors.
+
+    Args:
+        scaling_mode (str): Either 'weak' or 'strong'.
+    """
+    global parallel_scaling
+    parallel_scaling = scaling_mode
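For orientation, a minimal usage sketch of the new setting. It assumes the module above is DeepXDE's config module exposed as dde.config (the file path is not named in this diff), and the call sequence is illustrative rather than prescribed by the PR:

import deepxde as dde

# Assumed entry point: if the module above is deepxde.config, the new function
# would be called before building the data and model objects.
dde.config.set_parallel_scaling("weak")
# "weak" is also what the module picks automatically when Horovod detects
# more than one process; "strong" is rejected for tensorflow.compat.v1
# by the check added in the second changed file below.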
@@ -100,6 +100,10 @@ def __init__(
                 raise ValueError(
                     "Parallel training via Horovod only supports pseudo train distribution."
                 )
+            if config.parallel_scaling == "strong":
+                raise ValueError(
+                    "Strong scaling is not supported with tensorflow.compat.v1. Please use weak scaling."
+                )
         self.anchors = None if anchors is None else anchors.astype(config.real(np))
         self.exclusions = exclusions
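For intuition on the scaling modes referenced by the check above, here is a small, hypothetical sketch (not part of the PR) of how the two modes translate into per-rank workload; the helper name and numbers are purely illustrative:

# Hypothetical helper, for illustration only; not DeepXDE API.
def points_per_rank(num_domain, world_size, scaling_mode):
    if scaling_mode == "weak":
        # Weak scaling: each rank keeps the full point count, so the global
        # problem size grows with the number of processes.
        return num_domain
    if scaling_mode == "strong":
        # Strong scaling: a fixed global point count is split across ranks.
        return num_domain // world_size
    raise ValueError("scaling_mode must be 'weak' or 'strong'.")


print(points_per_rank(1000, 4, "weak"))    # 1000 points per rank, 4000 in total
print(points_per_rank(1000, 4, "strong"))  # 250 points per rank, 1000 in total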