Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion deepmd/pd/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -1038,7 +1038,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
if JIT:
break

if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0):
if (
self.change_bias_after_training
and self.num_steps > self.start_step
and (self.rank == 0 or dist.get_rank() == 0)
):
if not self.multi_task:
self.model = model_change_out_bias(
self.model,
Expand Down
6 changes: 5 additions & 1 deletion deepmd/pt/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -1745,7 +1745,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
if JIT:
break

if self.change_bias_after_training and (self.rank == 0 or dist.get_rank() == 0):
if (
self.change_bias_after_training
and self.num_steps > self.start_step
and (self.rank == 0 or dist.get_rank() == 0)
):
if not self.multi_task:
self.model = model_change_out_bias(
self.model,
Expand Down
31 changes: 31 additions & 0 deletions source/tests/pd/test_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@
from pathlib import (
Path,
)
from unittest.mock import (
patch,
)

import numpy as np
import paddle

from deepmd.pd.entrypoints.main import (
get_trainer,
Expand Down Expand Up @@ -163,6 +167,33 @@ def setUp(self) -> None:
self.config["training"]["save_freq"] = 1
enable_prim(True)

@patch("deepmd.pd.train.training.model_change_out_bias")
def test_zero_step_with_change_bias_saves_initial_checkpoint(
self, mocked_change_out_bias
) -> None:
def keep_model(model, *_args, **_kwargs):
return model

mocked_change_out_bias.side_effect = keep_model
config = deepcopy(self.config)
config["training"]["numb_steps"] = 0
config["training"]["change_bias_after_training"] = True
trainer = get_trainer(config)
trainer.run()

expected_model = Path(trainer.save_ckpt + "-0.pd")
self.assertEqual(expected_model, trainer.latest_model)
self.assertTrue(expected_model.exists())
self.assertEqual(
expected_model,
Path(Path("checkpoint").read_text().strip()),
)
checkpoint = paddle.load(str(expected_model))
train_infos = checkpoint["model"]["_extra_state"]["train_infos"]
self.assertEqual(0, train_infos["step"])
self.assertEqual(0.0, train_infos["lr"])
Comment on lines +193 to +194

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as the PT test: this passes on the unfixed code. With numb_steps=0, the pre-existing if self.num_steps == 0: block re-saves <ckpt>-0.pd with step=0, lr=0 and rewrites the checkpoint pointer after the bias block, so the assertions here (existence, latest_model, pointer, step==0, lr==0.0) are satisfied regardless of the num_steps > start_step guard. The only thing the fix changes — whether model_change_out_bias runs — is never asserted.

Suggest patching model_change_out_bias and asserting it is not called for numb_steps=0. Note the PD suite also has no test exercising the true branch (bias adjustment running for numb_steps>0), unlike PT's test_ema_checkpoint_keeps_changed_out_bias.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 1273e6e. The Paddle zero-step regression test now patches deepmd.pd.train.training.model_change_out_bias, returns the original model if it is called, and asserts assert_not_called() after trainer.run(). This directly checks the behavior changed by the guard instead of only checking the checkpoint rewrite path.

Validation:

  • uvx ruff check .
  • uvx ruff format --check .

I could not run the Paddle test locally because this environment is missing paddle (ModuleNotFoundError: No module named paddle).

mocked_change_out_bias.assert_not_called()

def tearDown(self) -> None:
DPTrainTest.tearDown(self)

Expand Down
27 changes: 27 additions & 0 deletions source/tests/pt/test_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,33 @@ def test_yaml_input(self) -> None:
)
self.assertTrue(Path("out.json").exists())

@patch("deepmd.pt.train.training.model_change_out_bias")
def test_zero_step_with_change_bias_saves_initial_checkpoint(
self, mocked_change_out_bias
) -> None:
def keep_model(model, *_args, **_kwargs):
return model

mocked_change_out_bias.side_effect = keep_model
config = deepcopy(self.config)
config["training"]["numb_steps"] = 0
config["training"]["change_bias_after_training"] = True
trainer = get_trainer(config)
trainer.run()

expected_model = Path(trainer.save_ckpt + "-0.pt")
self.assertEqual(expected_model, trainer.latest_model)
self.assertTrue(expected_model.exists())
self.assertEqual(
expected_model,
Path(Path("checkpoint").read_text().strip()),
)
checkpoint = torch.load(expected_model, map_location="cpu", weights_only=True)
train_infos = checkpoint["model"]["_extra_state"]["train_infos"]
self.assertEqual(0, train_infos["step"])
self.assertEqual(0.0, train_infos["lr"])
Comment on lines +289 to +290

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This regression test passes on the unfixed code, so it does not guard the bug it targets. With numb_steps=0, the pre-existing if self.num_steps == 0: block re-saves <ckpt>-0.pt with step=0, lr=0 and rewrites the checkpoint pointer after the bias block runs. So every assertion here — file exists, latest_model == -0, pointer content, train_infos["step"]==0, train_infos["lr"]==0.0 — holds whether or not the num_steps > start_step guard is applied.

The only behavior the fix actually changes is whether model_change_out_bias mutates the output bias, which this test never checks. Verified: reverting the guard in training.py while keeping this test still yields 1 passed.

Suggest mirroring test_ema_checkpoint_keeps_changed_out_bias — patch model_change_out_bias and assert it is not called (or assert out_bias is unchanged) when numb_steps=0. That makes it a genuine regression test.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 1273e6e. The PT zero-step regression test now patches deepmd.pt.train.training.model_change_out_bias, returns the original model if it is called, and asserts assert_not_called() after trainer.run(). This makes the test fail if the num_steps > start_step guard is removed, while keeping the existing checkpoint metadata assertions.

Validation:

  • pytest source/tests/pt/test_training.py::TestEnergyModelSeA::test_zero_step_with_change_bias_saves_initial_checkpoint -v
  • uvx ruff check .
  • uvx ruff format --check .

mocked_change_out_bias.assert_not_called()

def tearDown(self) -> None:
DPTrainTest.tearDown(self)
for ff in ["out.json", "input.yaml"]:
Expand Down
Loading