Update sensory agents to match the endpoint changes. (#108)

nqyy · web-flow · commit 60a0570a067e · 2025-04-15T20:31:16.000-04:00
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-20.04, ubuntu-latest]
+        os: [ubuntu-latest]
 
     steps:
       - name: Checkout
diff --git a/README.md b/README.md
@@ -148,7 +148,6 @@ _Embodied Agents are not yet capable of learning from in-context experience_:
 
 - [OpenVLA](https://api.mbodi.ai/community-models/)
 - [Sensory Tools](https://api.mbodi.ai/sense/)
-- [Embodied AI Playground](https://api.mbodi.ai/benchmark/)
 
 ### Roadmap
 
@@ -157,6 +156,7 @@ _Embodied Agents are not yet capable of learning from in-context experience_:
 - [x] Yolo, SAM2, DepthAnything Sensory Agents
 - [x] Auto Agent
 - [x] Google Gemini Backend
+- [ ] Pi0 Motor Agent
 - [ ] ROS integration
 - [ ] More Motor Agents, i.e. RT1
 - [ ] More device support, i.e. OpenCV camera
diff --git a/mbodied/agents/cli.py b/mbodied/agents/cli.py
@@ -391,8 +391,9 @@ def estimate_depth(ctx, image_filename, model_src, api_name, list, help) -> None
     DepthEstimationAgent = smart_import("mbodied.agents.sense", attribute="DepthEstimationAgent")
     image = Image(path=image_filename, size=(224, 224))
     agent: "DepthEstimationAgent" = DepthEstimationAgent(model_src=model_src)
-    result = agent.act(image=image, api_name=api_name)
+    result, depth_array = agent.act(image=image, api_name=api_name)
     result.pil.show()
+    print("Depth array shape", depth_array.shape)
 
 
 @sense.command("segment")
diff --git a/mbodied/agents/sense/depth_estimation_agent.py b/mbodied/agents/sense/depth_estimation_agent.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
+
 from mbodied.agents.sense.sensory_agent import SensoryAgent
 from mbodied.types.sense.vision import Image
 
@@ -49,12 +51,13 @@ def act(self, image: Image, *args, api_name: str = "/depth", **kwargs) -> Image:
         """
         if self.actor is None:
             raise ValueError("Remote actor for agent not initialized.")
-        response = self.actor.predict(image.base64, *args, api_name=api_name, **kwargs)
-        return Image(response)
+        response, depth_file = self.actor.predict(image.base64, *args, api_name=api_name, **kwargs)
+        return Image(response), np.load(depth_file)
 
 
 # Example usage:
 if __name__ == "__main__":
     agent = DepthEstimationAgent(model_src="https://api.mbodi.ai/sense/")
-    result = agent.act(image=Image("resources/bridge_example.jpeg"))
+    result, depth_array = agent.act(image=Image("resources/bridge_example.jpeg"))
     result.pil.show()
+    print("Depth array shape", depth_array.shape)
diff --git a/mbodied/agents/sense/segmentation_agent.py b/mbodied/agents/sense/segmentation_agent.py
@@ -59,11 +59,11 @@ def act(
         else:
             raise ValueError("Unsupported input type. Must be BBox2D, List[BBox2D], or PixelCoords.")
 
-        segmented_image, masks = self.actor.predict(
+        segmented_image, masks_file = self.actor.predict(
             image.base64, input_type, input_data_str, *args, api_name=api_name, **kwargs
         )
         # Convert gradio Dataframe numpy to numpy array.
-        masks = np.array(masks["data"])
+        masks = np.load(masks_file)
         return Image(segmented_image), masks
 
 
diff --git a/tests/test_auto_agent.py b/tests/test_auto_agent.py
@@ -5,6 +5,19 @@
 from mbodied.agents.motion.openvla_agent import OpenVlaAgent
 from mbodied.agents.backends.gradio_backend import GradioBackend
 from mbodied.agents.auto.auto_agent import AutoAgent, get_agent
+import numpy as np
+import tempfile
+import base64
+from io import BytesIO
+from PIL import Image as PILImage
+
+
+def get_dummy_base64_image():
+    """Create a minimal valid base64 image for testing."""
+    img = PILImage.new("RGB", (10, 10), color="red")
+    buffered = BytesIO()
+    img.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
 
 
 @pytest.fixture
@@ -56,22 +69,32 @@ def test_auto_openvla_agent_act(auto_openvla_agent_get_method):
 
 @pytest.fixture
 def mock_depth_gradio_backend():
+    # Create a temp file that would be a valid location for a numpy file
+    temp_file = tempfile.NamedTemporaryFile(suffix=".npy", delete=False)
+    temp_path = temp_file.name
+    temp_file.close()
+    dummy_image_b64 = get_dummy_base64_image()
+
     with patch.object(GradioBackend, "__init__", lambda x, model_src=None, **kwargs: None):
-        with patch.object(GradioBackend, "predict", return_value=Image(size=(224, 224))):
-            yield GradioBackend(endpoint="http://1.2.3.4:1234")
+        # Return a valid base64 image string and a path
+        with patch.object(GradioBackend, "predict", return_value=(dummy_image_b64, temp_path)):
+            # Intercept np.load calls to avoid actual file system access
+            with patch("numpy.load", return_value=np.zeros((224, 224))):
+                yield GradioBackend(endpoint="http://1.2.3.4:1234")
 
 
 @pytest.fixture
 def depth_agent(mock_depth_gradio_backend):
     agent = AutoAgent(task="sense-depth-estimation", model_src="http://1.2.3.4:1234/")
-    agent.actor = mock_openvla_gradio_backend
+    agent.actor = mock_depth_gradio_backend
     return agent
 
 
 def test_auto_depth_agent_act(depth_agent):
     mock_image = MagicMock(spec=Image)
     mock_image.base64 = "base64encodedimage"
 
-    result = depth_agent.act(mock_image)
+    result, depth_array = depth_agent.act(mock_image)
 
     assert isinstance(result, Image)
+    assert isinstance(depth_array, np.ndarray)
diff --git a/tests/test_depth_agent.py b/tests/test_depth_agent.py
@@ -1,16 +1,37 @@
 import pytest
 from unittest.mock import patch, MagicMock
+import numpy as np
+import tempfile
+import base64
+from io import BytesIO
+from PIL import Image as PILImage
 from mbodied.types.sense.vision import Image
 from mbodied.agents.sense.depth_estimation_agent import DepthEstimationAgent
-from mbodied.agents.sense.object_detection_agent import ObjectDetectionAgent
 from mbodied.agents.backends.gradio_backend import GradioBackend
 
 
+def get_dummy_base64_image():
+    """Create a minimal valid base64 image for testing."""
+    img = PILImage.new("RGB", (10, 10), color="red")
+    buffered = BytesIO()
+    img.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
+
+
 @pytest.fixture
 def mock_gradio_backend():
+    # Create a temp file that would be a valid location for a numpy file
+    temp_file = tempfile.NamedTemporaryFile(suffix=".npy", delete=False)
+    temp_path = temp_file.name
+    temp_file.close()
+    dummy_image_b64 = get_dummy_base64_image()
+
     with patch.object(GradioBackend, "__init__", lambda x, model_src=None, **kwargs: None):
-        with patch.object(GradioBackend, "predict", return_value=Image(size=(224, 224))):
-            yield GradioBackend(endpoint="http://1.2.3.4:1234")
+        # Return a valid base64 image string and a path
+        with patch.object(GradioBackend, "predict", return_value=(dummy_image_b64, temp_path)):
+            # Intercept np.load calls to avoid actual file system access
+            with patch("numpy.load", return_value=np.zeros((224, 224))):
+                yield GradioBackend(endpoint="http://1.2.3.4:1234")
 
 
 @pytest.fixture
@@ -29,14 +50,16 @@ def test_depth_agent_act(depth_agent):
     mock_image = MagicMock(spec=Image)
     mock_image.base64 = "base64encodedimage"
 
-    result = depth_agent.act(mock_image)
+    result_img, depth_array = depth_agent.act(mock_image)
 
-    assert isinstance(result, Image)
+    assert isinstance(result_img, Image)
+    assert isinstance(depth_array, np.ndarray)
 
 
 @pytest.mark.network
 def test_real_depth_agent_act():
     # Make real network call.
     agent = DepthEstimationAgent(model_src="https://api.mbodi.ai/sense/")
-    result = agent.act(image=Image("resources/xarm.jpeg", size=(224, 224)))
-    assert isinstance(result, Image)
+    result_img, depth_array = agent.act(image=Image("resources/xarm.jpeg", size=(224, 224)))
+    assert isinstance(result_img, Image)
+    assert isinstance(depth_array, np.ndarray)
diff --git a/tests/test_segmentation_agent.py b/tests/test_segmentation_agent.py
@@ -1,23 +1,42 @@
 import pytest
 import numpy as np
+import tempfile
+import base64
+from io import BytesIO
+from PIL import Image as PILImage
 from unittest.mock import patch, MagicMock
 from mbodied.types.sense.world import BBox2D, PixelCoords
 from mbodied.types.sense.vision import Image
 from mbodied.agents.sense.segmentation_agent import SegmentationAgent
 
 
+def get_dummy_base64_image():
+    """Create a minimal valid base64 image for testing."""
+    img = PILImage.new("RGB", (10, 10), color="red")
+    buffered = BytesIO()
+    img.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode()
+
+
 @pytest.fixture
 def mock_gradio_backend():
+    # Create a temp file that would be a valid location for a numpy file
+    temp_file = tempfile.NamedTemporaryFile(suffix=".npy", delete=False)
+    temp_path = temp_file.name
+    temp_file.close()
+    dummy_image_b64 = get_dummy_base64_image()
+
     with patch(
         "mbodied.agents.backends.gradio_backend.GradioBackend.__init__", lambda x, model_src=None, **kwargs: None
     ):
         with patch(
             "mbodied.agents.backends.gradio_backend.GradioBackend.predict",
-            return_value=(Image((224, 224)), {"data": [[0]]}),
+            return_value=(dummy_image_b64, temp_path),
         ):
-            from mbodied.agents.backends.gradio_backend import GradioBackend
+            with patch("numpy.load", return_value=np.zeros((224, 224, 1))):
+                from mbodied.agents.backends.gradio_backend import GradioBackend
 
-            yield GradioBackend(endpoint="http://1.2.3.4:1234")
+                yield GradioBackend(endpoint="http://1.2.3.4:1234")
 
 
 @pytest.fixture