quixio · daniil-quix · Nov 6, 2024 · Nov 5, 2024 · Nov 6, 2024 · Nov 6, 2024
diff --git a/quixstreams/sources/base/source.py b/quixstreams/sources/base/source.py
@@ -182,8 +182,8 @@ def main():
 
     def __init__(self, name: str, shutdown_timeout: float = 10) -> None:
         """
-        :param name: The source unique name. Used to generate the topic configurtion
-        :param shutdown_timeout: Time in second the application waits for the source to gracefully shutdown
+        :param name: The source unique name. It is used to generate the topic configuration.
+        :param shutdown_timeout: Time in seconds the application waits for the source to shut down gracefully.
         """
         super().__init__()
 

diff --git a/quixstreams/sources/core/csv.py b/quixstreams/sources/core/csv.py
@@ -1,74 +1,81 @@
 import csv
-import json
-from typing import Any, Callable, Optional
+import logging
+import time
+from pathlib import Path
+from typing import AnyStr, Callable, Optional, Union
 
 from quixstreams.models.topics import Topic
 from quixstreams.sources.base import Source
 
+logger = logging.getLogger(__name__)
+
 
 class CSVSource(Source):
     def __init__(
         self,
-        path: str,
-        dialect: str = "excel",
-        name: Optional[str] = None,
+        name: str,
+        path: Union[str, Path],
+        key_extractor: Optional[Callable[[dict], AnyStr]] = None,
+        timestamp_extractor: Optional[Callable[[dict], int]] = None,
+        delay: float = 0,
         shutdown_timeout: float = 10,
-        key_deserializer: Callable[[Any], str] = str,
-        value_deserializer: Callable[[Any], str] = json.loads,
+        dialect: str = "excel",
     ) -> None:
         """
-        A base CSV source that reads data from a single CSV file.
-        Best used with `quixstreams.sinks.csv.CSVSink`.
-
-        Required columns: key, value
-        Optional columns: timestamp
+        A base CSV source that reads data from a CSV file and produces rows
+        to the Kafka topic in JSON format.
 
+        :param name: The source unique name. Used to generate the topic configuration.
         :param path: path to the CSV file
+        :param key_extractor: an optional callable to extract the message key from the row.
+            It must return either `str` or `bytes`.
+            If empty, the Kafka messages will be produced without keys.
+            Default - `None`.
+        :param timestamp_extractor: an optional callable to extract the message timestamp from the row.
+            It must return time in milliseconds as `int`.
+            If empty, the current epoch will be used.
+            Default - `None`.
+        :param shutdown_timeout: Time in seconds the application waits for the source to gracefully shut down.
         :param dialect: a CSV dialect to use. It affects quoting and delimiters.
             See the ["csv" module docs](https://docs.python.org/3/library/csv.html#csv-fmt-params) for more info.
             Default - `"excel"`.
-        :param key_deseralizer: a callable to convert strings to key.
-            Default - `str`
-        :param value_deserializer: a callable to convert strings to value.
-            Default - `json.loads`
         """
-        super().__init__(name or path, shutdown_timeout)
         self.path = path
+        self.delay = delay
         self.dialect = dialect
 
-        self._key_deserializer = key_deserializer
-        self._value_deserializer = value_deserializer
+        self.key_extractor = key_extractor
+        self.timestamp_extractor = timestamp_extractor
 
-    def run(self):
-        key_deserializer = self._key_deserializer
-        value_deserializer = self._value_deserializer
+        super().__init__(name=name, shutdown_timeout=shutdown_timeout)
 
+    def run(self):
+        # Start reading the file
         with open(self.path, "r") as f:
+            logger.info(f'Producing data from the file "{self.path}"')
             reader = csv.DictReader(f, dialect=self.dialect)
 
             while self.running:
                 try:
-                    item = next(reader)
+                    row = next(reader)
                 except StopIteration:
                     return
 
-                # if a timestamp column exist with no value timestamp is ""
-                timestamp = item.get("timestamp") or None
-                if timestamp is not None:
-                    timestamp = int(timestamp)
-
-                msg = self.serialize(
-                    key=key_deserializer(item["key"]),
-                    value=value_deserializer(item["value"]),
-                    timestamp_ms=timestamp,
+                # Extract message key from the row
+                message_key = self.key_extractor(row) if self.key_extractor else None
+                # Extract timestamp from the row
+                timestamp = (
+                    self.timestamp_extractor(row) if self.timestamp_extractor else None
                 )
+                # Serialize data before sending to Kafka
+                msg = self.serialize(key=message_key, value=row, timestamp_ms=timestamp)
 
-                self.produce(
-                    key=msg.key,
-                    value=msg.value,
-                    timestamp=msg.timestamp,
-                    headers=msg.headers,
-                )
+                # Publish the data to the topic
+                self.produce(timestamp=msg.timestamp, key=msg.key, value=msg.value)
+
+                # If the delay is specified, sleep before producing the next row
+                if self.delay > 0:
+                    time.sleep(self.delay)
 
     def default_topic(self) -> Topic:
         return Topic(

diff --git a/tests/test_quixstreams/test_sources/test_core/test_csv.py b/tests/test_quixstreams/test_sources/test_core/test_csv.py
@@ -1,5 +1,4 @@
 import csv
-import json
 from unittest.mock import MagicMock
 
 import pytest
@@ -19,24 +18,26 @@ def test_read(self, tmp_path, producer):
         path = tmp_path / "source.csv"
         with open(path, "w") as f:
             writer = csv.DictWriter(
-                f, dialect="excel", fieldnames=("key", "value", "timestamp")
+                f, dialect="excel", fieldnames=("key", "field", "timestamp")
             )
             writer.writeheader()
             writer.writerows(
                 [
-                    {"key": "key1", "value": json.dumps({"value": "value1"})},
-                    {"key": "key2", "value": json.dumps({"value": "value2"})},
-                    {"key": "key3", "value": json.dumps({"value": "value3"})},
-                    {"key": "key4", "value": json.dumps({"value": "value4"})},
-                    {
-                        "key": "key5",
-                        "value": json.dumps({"value": "value5"}),
-                        "timestamp": 10000,
-                    },
+                    {"key": "key1", "field": "value1", "timestamp": 1},
+                    {"key": "key2", "field": "value2", "timestamp": 2},
+                    {"key": "key3", "field": "value3", "timestamp": 3},
+                    {"key": "key4", "field": "value4", "timestamp": 4},
+                    {"key": "key5", "field": "value5", "timestamp": 5},
                 ]
             )
 
-        source = CSVSource(path)
+        name = "csv"
+        source = CSVSource(
+            name=name,
+            path=path,
+            key_extractor=lambda r: r["key"],
+            timestamp_extractor=lambda r: int(r["timestamp"]),
+        )
         source.configure(source.default_topic(), producer)
         source.start()
 
@@ -48,27 +49,30 @@ def test_read(self, tmp_path, producer):
             "key": b"key5",
             "partition": None,
             "poll_timeout": 5.0,
-            "timestamp": 10000,
-            "topic": path,
-            "value": b'{"value":"value5"}',
+            "timestamp": 5,
+            "topic": name,
+            "value": b'{"key":"key5","field":"value5","timestamp":"5"}',
         }
 
-    def test_read_no_timestamp(self, tmp_path, producer):
+    def test_read_no_extractors(self, tmp_path, producer):
         path = tmp_path / "source.csv"
         with open(path, "w") as f:
-            writer = csv.DictWriter(f, dialect="excel", fieldnames=("key", "value"))
+            writer = csv.DictWriter(
+                f, dialect="excel", fieldnames=("key", "field", "timestamp")
+            )
             writer.writeheader()
             writer.writerows(
                 [
-                    {"key": "key1", "value": json.dumps({"value": "value1"})},
-                    {"key": "key2", "value": json.dumps({"value": "value2"})},
-                    {"key": "key3", "value": json.dumps({"value": "value3"})},
-                    {"key": "key4", "value": json.dumps({"value": "value4"})},
-                    {"key": "key5", "value": json.dumps({"value": "value5"})},
+                    {"key": "key1", "field": "value1", "timestamp": 1},
+                    {"key": "key2", "field": "value2", "timestamp": 2},
+                    {"key": "key3", "field": "value3", "timestamp": 3},
+                    {"key": "key4", "field": "value4", "timestamp": 4},
+                    {"key": "key5", "field": "value5", "timestamp": 5},
                 ]
             )
 
-        source = CSVSource(path)
+        name = "csv"
+        source = CSVSource(name="csv", path=path)
         source.configure(source.default_topic(), producer)
         source.start()
 
@@ -77,10 +81,10 @@ def test_read_no_timestamp(self, tmp_path, producer):
         assert producer.produce.call_args.kwargs == {
             "buffer_error_max_tries": 3,
             "headers": None,
-            "key": b"key5",
+            "key": None,
             "partition": None,
             "poll_timeout": 5.0,
             "timestamp": None,
-            "topic": path,
-            "value": b'{"value":"value5"}',
+            "topic": name,
+            "value": b'{"key":"key5","field":"value5","timestamp":"5"}',
         }