cpython/Lib/profiling/sampling/stack_collector.py

import base64
import collections
import functools
import importlib.resources
import json
import linecache
import os

from .collector import Collector


class StackTraceCollector(Collector):
    """Accumulate sampled call stacks together with per-frame sample counts.

    ``call_trees`` receives one list per non-empty sampled stack, stored in
    the reverse of the order in which the frames were delivered, and
    ``function_samples`` maps every frame entry to how many times it has
    been seen.
    """

    def __init__(self):
        # One list per recorded stack (reversed relative to the input).
        self.call_trees = []
        # frame entry -> number of occurrences across all recorded stacks
        self.function_samples = collections.defaultdict(int)

    def _process_frames(self, frames):
        """Record a single thread's frame stack; empty stacks are skipped."""
        if frames:
            # Keep the whole stack, flipped so the exporters in this file
            # can treat it as root-first.
            self.call_trees.append([*reversed(frames)])

            # Bump the counter once for every entry of the stack.
            counts = self.function_samples
            for entry in frames:
                counts[entry] += 1

    def collect(self, stack_frames):
        """Process every stack yielded by ``_iter_all_frames(stack_frames)``."""
        for stack in self._iter_all_frames(stack_frames):
            self._process_frames(stack)


class CollapsedStackCollector(StackTraceCollector):
    """Collector that exports the recorded stacks in collapsed-stack form.

    Every distinct stack becomes one output line of the shape
    ``frame;frame;...;frame count``.
    """

    def export(self, filename):
        """Write the aggregated stacks to *filename* and print a confirmation.

        Each frame is rendered as ``<basename of frame[0]>:<frame[2]>:<frame[1]>``
        and the frames of a stack are joined with ``;``.  Identical stacks are
        merged and followed by the number of times they were recorded.  Lines
        appear in the order in which each distinct stack was first seen.
        """
        # Call trees are already stored root->leaf, so joining them in order
        # gives the collapsed representation directly.
        stack_counter = collections.Counter(
            ";".join(
                f"{os.path.basename(frame[0])}:{frame[2]}:{frame[1]}"
                for frame in call_tree
            )
            for call_tree in self.call_trees
        )

        # Write UTF-8 explicitly (as FlamegraphCollector.export does) so that
        # non-ASCII function or file names cannot fail, or be encoded
        # differently, depending on the locale's default encoding.
        with open(filename, "w", encoding="utf-8") as out:
            out.writelines(
                f"{stack} {count}\n" for stack, count in stack_counter.items()
            )
        print(f"Collapsed stack output written to {filename}")


class FlamegraphCollector(StackTraceCollector):
    def __init__(self):
        super().__init__()
        self.stats = {}

    def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=None):
        """Set profiling statistics to include in flamegraph data."""
        self.stats = {
            "sample_interval_usec": sample_interval_usec,
            "duration_sec": duration_sec,
            "sample_rate": sample_rate,
            "error_rate": error_rate
        }

    def export(self, filename):
        flamegraph_data = self._convert_to_flamegraph_format()

        # Debug output
        num_functions = len(flamegraph_data.get("children", []))
        total_time = flamegraph_data.get("value", 0)
        print(
            f"Flamegraph data: {num_functions} root functions, total samples: {total_time}"
        )

        if num_functions == 0:
            print(
                "Warning: No functions found in profiling data. Check if sampling captured any data."
            )
            return

        html_content = self._create_flamegraph_html(flamegraph_data)

        with open(filename, "w", encoding="utf-8") as f:
            f.write(html_content)

        print(f"Flamegraph saved to: {filename}")

    @staticmethod
    @functools.lru_cache(maxsize=None)
    def _format_function_name(func):
        filename, lineno, funcname = func

        if len(filename) > 50:
            parts = filename.split("/")
            if len(parts) > 2:
                filename = f".../{'/'.join(parts[-2:])}"

        return f"{funcname} ({filename}:{lineno})"

    def _convert_to_flamegraph_format(self):
        """Convert call trees to d3-flamegraph format with optimized hierarchy building"""
        if not self.call_trees:
            return {"name": "No Data", "value": 0, "children": []}

        unique_functions = set()
        for call_tree in self.call_trees:
            unique_functions.update(call_tree)

        func_to_name = {
            func: self._format_function_name(func) for func in unique_functions
        }

        root = {"name": "root", "children": {}, "samples": 0}

        for call_tree in self.call_trees:
            current_node = root
            current_node["samples"] += 1

            for func in call_tree:
                func_name = func_to_name[func]  # Use pre-computed name

                if func_name not in current_node["children"]:
                    current_node["children"][func_name] = {
                        "name": func_name,
                        "func": func,
                        "children": {},
                        "samples": 0,
                        "filename": func[0],
                        "lineno": func[1],
                        "funcname": func[2],
                    }

                current_node = current_node["children"][func_name]
                current_node["samples"] += 1

        def convert_node(node, min_samples=1):
            if node["samples"] < min_samples:
                return None

            source_code = None
            if "func" in node:
                source_code = self._get_source_lines(node["func"])

            result = {
                "name": node["name"],
                "value": node["samples"],
                "children": [],
            }

            if "filename" in node:
                result.update(
                    {
                        "filename": node["filename"],
                        "lineno": node["lineno"],
                        "funcname": node["funcname"],
                    }
                )

            if source_code:
                result["source"] = source_code

            # Recursively convert children
            child_nodes = []
            for child_name, child_node in node["children"].items():
                child_result = convert_node(child_node, min_samples)
                if child_result:
                    child_nodes.append(child_result)

            # Sort children by sample count (descending)
            child_nodes.sort(key=lambda x: x["value"], reverse=True)
            result["children"] = child_nodes

            return result

        # Filter out very small functions (less than 0.1% of total samples)
        total_samples = len(self.call_trees)
        min_samples = max(1, int(total_samples * 0.001))

        converted_root = convert_node(root, min_samples)

        if not converted_root or not converted_root["children"]:
            return {"name": "No significant data", "value": 0, "children": []}

        # If we only have one root child, make it the root to avoid redundant level
        if len(converted_root["children"]) == 1:
            main_child = converted_root["children"][0]
            main_child["name"] = f"Program Root: {main_child['name']}"
            main_child["stats"] = self.stats
            return main_child

        converted_root["name"] = "Program Root"
        converted_root["stats"] = self.stats
        return converted_root

    def _get_source_lines(self, func):
        filename, lineno, funcname = func

        try:
            # Get several lines around the function definition
            lines = []
            start_line = max(1, lineno - 2)
            end_line = lineno + 3

            for line_num in range(start_line, end_line):
                line = linecache.getline(filename, line_num)
                if line.strip():
                    marker = "→ " if line_num == lineno else "  "
                    lines.append(f"{marker}{line_num}: {line.rstrip()}")

            return lines if lines else None

        except Exception:
            # If we can't get source code, return None
            return None

    def _create_flamegraph_html(self, data):
        """Return a single self-contained HTML page visualising *data*.

        The page is assembled from ``flamegraph_template.html`` in this
        package: the first-party CSS/JS, the Python logo (as a base64 data
        URI) and the vendored d3 / d3-flame-graph assets are substituted for
        their ``<!-- INLINE_... -->`` placeholders, and finally
        ``{{FLAMEGRAPH_DATA}}`` is replaced by *data* serialised as JSON.

        Errors from reading the packaged resources propagate to the caller.
        """
        # json.dumps() leaves "<", ">" and "&" as-is.  The data includes
        # arbitrary text (file names, captured source lines), so a value such
        # as "</script>" or "<!--" could end the enclosing script element
        # early and corrupt the page or inject markup.  In JSON these
        # characters can only occur inside string literals, where the \uXXXX
        # forms decode to the same text, so escaping them is lossless.
        # NOTE(review): assumes the data placeholder sits inside a <script>
        # element (the template is not visible here) -- confirm.
        data_json = (
            json.dumps(data)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

        template_dir = importlib.resources.files(__package__)
        vendor_dir = template_dir / "_vendor"
        assets_dir = template_dir / "_assets"

        d3_path = vendor_dir / "d3" / "7.8.5" / "d3.min.js"
        d3_flame_graph_dir = vendor_dir / "d3-flame-graph" / "4.1.3"
        fg_css_path = d3_flame_graph_dir / "d3-flamegraph.css"
        fg_js_path = d3_flame_graph_dir / "d3-flamegraph.min.js"
        fg_tooltip_js_path = d3_flame_graph_dir / "d3-flamegraph-tooltip.min.js"

        html_template = (template_dir / "flamegraph_template.html").read_text(encoding="utf-8")
        css_content = (template_dir / "flamegraph.css").read_text(encoding="utf-8")
        js_content = (template_dir / "flamegraph.js").read_text(encoding="utf-8")

        png_path = assets_dir / "python-logo-only.png"
        b64_logo = base64.b64encode(png_path.read_bytes()).decode("ascii")
        # Let CSS control size; keep markup simple
        logo_html = f'<img src="data:image/png;base64,{b64_logo}" alt="Python logo"/>'

        d3_js = d3_path.read_text(encoding="utf-8")
        fg_css = fg_css_path.read_text(encoding="utf-8")
        fg_js = fg_js_path.read_text(encoding="utf-8")
        fg_tooltip_js = fg_tooltip_js_path.read_text(encoding="utf-8")

        # Applied in this order: first-party assets, logo, vendored assets.
        replacements = (
            ("<!-- INLINE_CSS -->", f"<style>\n{css_content}\n</style>"),
            ("<!-- INLINE_JS -->", f"<script>\n{js_content}\n</script>"),
            ("<!-- INLINE_LOGO -->", logo_html),
            ("<!-- INLINE_VENDOR_D3_JS -->", f"<script>\n{d3_js}\n</script>"),
            (
                "<!-- INLINE_VENDOR_FLAMEGRAPH_CSS -->",
                f"<style>\n{fg_css}\n</style>",
            ),
            (
                "<!-- INLINE_VENDOR_FLAMEGRAPH_JS -->",
                f"<script>\n{fg_js}\n</script>",
            ),
            (
                "<!-- INLINE_VENDOR_FLAMEGRAPH_TOOLTIP_JS -->",
                f"<script>\n{fg_tooltip_js}\n</script>",
            ),
        )
        for placeholder, content in replacements:
            html_template = html_template.replace(placeholder, content)

        # Substitute the data last so that text inside it which happens to
        # look like one of the placeholders above is never expanded.
        return html_template.replace("{{FLAMEGRAPH_DATA}}", data_json)
gh-135953: Add flamegraph reporter to sampling profiler (#138715) 2025-09-09 23:06:45 +01:00			`import base64`
gh-135953: Implement sampling tool under profile.sample (#135998) Implement a statistical sampling profiler that can profile external Python processes by PID. Uses the _remote_debugging module and converts the results to pstats-compatible format for analysis. Co-authored-by: Pablo Galindo <pablogsal@gmail.com> 2025-07-10 18:44:24 +01:00			`import collections`
gh-135953: Add flamegraph reporter to sampling profiler (#138715) 2025-09-09 23:06:45 +01:00			`import functools`
			`import importlib.resources`
			`import json`
			`import linecache`
gh-135953: Implement sampling tool under profile.sample (#135998) Implement a statistical sampling profiler that can profile external Python processes by PID. Uses the _remote_debugging module and converts the results to pstats-compatible format for analysis. Co-authored-by: Pablo Galindo <pablogsal@gmail.com> 2025-07-10 18:44:24 +01:00			`import os`

			`from .collector import Collector`


			`class StackTraceCollector(Collector):`
			`def __init__(self):`
			`self.call_trees = []`
			`self.function_samples = collections.defaultdict(int)`

gh-138385: Sample all interpreters in the tachyon profiler (#138398) 2025-09-09 00:41:08 +01:00			`def _process_frames(self, frames):`
			`"""Process a single thread's frame stack."""`
			`if not frames:`
			`return`

			`# Store the complete call stack (reverse order - root first)`
			`call_tree = list(reversed(frames))`
			`self.call_trees.append(call_tree)`

			`# Count samples per function`
			`for frame in frames:`
			`self.function_samples[frame] += 1`

gh-135953: Implement sampling tool under profile.sample (#135998) Implement a statistical sampling profiler that can profile external Python processes by PID. Uses the _remote_debugging module and converts the results to pstats-compatible format for analysis. Co-authored-by: Pablo Galindo <pablogsal@gmail.com> 2025-07-10 18:44:24 +01:00			`def collect(self, stack_frames):`
gh-138385: Sample all interpreters in the tachyon profiler (#138398) 2025-09-09 00:41:08 +01:00			`for frames in self._iter_all_frames(stack_frames):`
			`self._process_frames(frames)`
gh-135953: Implement sampling tool under profile.sample (#135998) Implement a statistical sampling profiler that can profile external Python processes by PID. Uses the _remote_debugging module and converts the results to pstats-compatible format for analysis. Co-authored-by: Pablo Galindo <pablogsal@gmail.com> 2025-07-10 18:44:24 +01:00

			`class CollapsedStackCollector(StackTraceCollector):`
			`def export(self, filename):`
			`stack_counter = collections.Counter()`
			`for call_tree in self.call_trees:`
			`# Call tree is already in root->leaf order`
			`stack_str = ";".join(`
			`f"{os.path.basename(f[0])}:{f[2]}:{f[1]}" for f in call_tree`
			`)`
			`stack_counter[stack_str] += 1`

			`with open(filename, "w") as f:`
			`for stack, count in stack_counter.items():`
			`f.write(f"{stack} {count}\n")`
			`print(f"Collapsed stack output written to {filename}")`
gh-135953: Add flamegraph reporter to sampling profiler (#138715) 2025-09-09 23:06:45 +01:00

			`class FlamegraphCollector(StackTraceCollector):`
			`def __init__(self):`
			`super().__init__()`
			`self.stats = {}`

			`def set_stats(self, sample_interval_usec, duration_sec, sample_rate, error_rate=None):`
			`"""Set profiling statistics to include in flamegraph data."""`
			`self.stats = {`
			`"sample_interval_usec": sample_interval_usec,`
			`"duration_sec": duration_sec,`
			`"sample_rate": sample_rate,`
			`"error_rate": error_rate`
			`}`

			`def export(self, filename):`
			`flamegraph_data = self._convert_to_flamegraph_format()`

			`# Debug output`
			`num_functions = len(flamegraph_data.get("children", []))`
			`total_time = flamegraph_data.get("value", 0)`
			`print(`
			`f"Flamegraph data: {num_functions} root functions, total samples: {total_time}"`
			`)`

			`if num_functions == 0:`
			`print(`
			`"Warning: No functions found in profiling data. Check if sampling captured any data."`
			`)`
			`return`

			`html_content = self._create_flamegraph_html(flamegraph_data)`

			`with open(filename, "w", encoding="utf-8") as f:`
			`f.write(html_content)`

			`print(f"Flamegraph saved to: {filename}")`

gh-135953: Fix refleak in cache method (#138721) 2025-09-10 01:08:09 +01:00			`@staticmethod`
gh-135953: Add flamegraph reporter to sampling profiler (#138715) 2025-09-09 23:06:45 +01:00			`@functools.lru_cache(maxsize=None)`
gh-135953: Fix refleak in cache method (#138721) 2025-09-10 01:08:09 +01:00			`def _format_function_name(func):`
gh-135953: Add flamegraph reporter to sampling profiler (#138715) 2025-09-09 23:06:45 +01:00			`filename, lineno, funcname = func`

			`if len(filename) > 50:`
			`parts = filename.split("/")`
			`if len(parts) > 2:`
			`filename = f".../{'/'.join(parts[-2:])}"`

			`return f"{funcname} ({filename}:{lineno})"`

			`def _convert_to_flamegraph_format(self):`
			`"""Convert call trees to d3-flamegraph format with optimized hierarchy building"""`
			`if not self.call_trees:`
			`return {"name": "No Data", "value": 0, "children": []}`

			`unique_functions = set()`
			`for call_tree in self.call_trees:`
			`unique_functions.update(call_tree)`

			`func_to_name = {`
			`func: self._format_function_name(func) for func in unique_functions`
			`}`

			`root = {"name": "root", "children": {}, "samples": 0}`

			`for call_tree in self.call_trees:`
			`current_node = root`
			`current_node["samples"] += 1`

			`for func in call_tree:`
			`func_name = func_to_name[func] # Use pre-computed name`

			`if func_name not in current_node["children"]:`
			`current_node["children"][func_name] = {`
			`"name": func_name,`
			`"func": func,`
			`"children": {},`
			`"samples": 0,`
			`"filename": func[0],`
			`"lineno": func[1],`
			`"funcname": func[2],`
			`}`

			`current_node = current_node["children"][func_name]`
			`current_node["samples"] += 1`

			`def convert_node(node, min_samples=1):`
			`if node["samples"] < min_samples:`
			`return None`

			`source_code = None`
			`if "func" in node:`
			`source_code = self._get_source_lines(node["func"])`

			`result = {`
			`"name": node["name"],`
			`"value": node["samples"],`
			`"children": [],`
			`}`

			`if "filename" in node:`
			`result.update(`
			`{`
			`"filename": node["filename"],`
			`"lineno": node["lineno"],`
			`"funcname": node["funcname"],`
			`}`
			`)`

			`if source_code:`
			`result["source"] = source_code`

			`# Recursively convert children`
			`child_nodes = []`
			`for child_name, child_node in node["children"].items():`
			`child_result = convert_node(child_node, min_samples)`
			`if child_result:`
			`child_nodes.append(child_result)`

			`# Sort children by sample count (descending)`
			`child_nodes.sort(key=lambda x: x["value"], reverse=True)`
			`result["children"] = child_nodes`

			`return result`

			`# Filter out very small functions (less than 0.1% of total samples)`
			`total_samples = len(self.call_trees)`
			`min_samples = max(1, int(total_samples * 0.001))`

			`converted_root = convert_node(root, min_samples)`

			`if not converted_root or not converted_root["children"]:`
			`return {"name": "No significant data", "value": 0, "children": []}`

			`# If we only have one root child, make it the root to avoid redundant level`
			`if len(converted_root["children"]) == 1:`
			`main_child = converted_root["children"][0]`
			`main_child["name"] = f"Program Root: {main_child['name']}"`
			`main_child["stats"] = self.stats`
			`return main_child`

			`converted_root["name"] = "Program Root"`
			`converted_root["stats"] = self.stats`
			`return converted_root`

			`def _get_source_lines(self, func):`
			`filename, lineno, funcname = func`

			`try:`
			`# Get several lines around the function definition`
			`lines = []`
			`start_line = max(1, lineno - 2)`
			`end_line = lineno + 3`

			`for line_num in range(start_line, end_line):`
			`line = linecache.getline(filename, line_num)`
			`if line.strip():`
			`marker = "→ " if line_num == lineno else " "`
			`lines.append(f"{marker}{line_num}: {line.rstrip()}")`

			`return lines if lines else None`

			`except Exception:`
			`# If we can't get source code, return None`
			`return None`

			`def _create_flamegraph_html(self, data):`
			`data_json = json.dumps(data)`

			`template_dir = importlib.resources.files(__package__)`
			`vendor_dir = template_dir / "_vendor"`
			`assets_dir = template_dir / "_assets"`

			`d3_path = vendor_dir / "d3" / "7.8.5" / "d3.min.js"`
			`d3_flame_graph_dir = vendor_dir / "d3-flame-graph" / "4.1.3"`
			`fg_css_path = d3_flame_graph_dir / "d3-flamegraph.css"`
			`fg_js_path = d3_flame_graph_dir / "d3-flamegraph.min.js"`
			`fg_tooltip_js_path = d3_flame_graph_dir / "d3-flamegraph-tooltip.min.js"`

			`html_template = (template_dir / "flamegraph_template.html").read_text(encoding="utf-8")`
			`css_content = (template_dir / "flamegraph.css").read_text(encoding="utf-8")`
			`js_content = (template_dir / "flamegraph.js").read_text(encoding="utf-8")`

			`# Inline first-party CSS/JS`
			`html_template = html_template.replace(`
			`"<!-- INLINE_CSS -->", f"<style>\n{css_content}\n</style>"`
			`)`
			`html_template = html_template.replace(`
			`"<!-- INLINE_JS -->", f"<script>\n{js_content}\n</script>"`
			`)`

			`png_path = assets_dir / "python-logo-only.png"`
			`b64_logo = base64.b64encode(png_path.read_bytes()).decode("ascii")`

			`# Let CSS control size; keep markup simple`
			`logo_html = f'<img src="data:image/png;base64,{b64_logo}" alt="Python logo"/>'`
			`html_template = html_template.replace("<!-- INLINE_LOGO -->", logo_html)`

			`d3_js = d3_path.read_text(encoding="utf-8")`
			`fg_css = fg_css_path.read_text(encoding="utf-8")`
			`fg_js = fg_js_path.read_text(encoding="utf-8")`
			`fg_tooltip_js = fg_tooltip_js_path.read_text(encoding="utf-8")`

			`html_template = html_template.replace(`
			`"<!-- INLINE_VENDOR_D3_JS -->",`
			`f"<script>\n{d3_js}\n</script>",`
			`)`
			`html_template = html_template.replace(`
			`"<!-- INLINE_VENDOR_FLAMEGRAPH_CSS -->",`
			`f"<style>\n{fg_css}\n</style>",`
			`)`
			`html_template = html_template.replace(`
			`"<!-- INLINE_VENDOR_FLAMEGRAPH_JS -->",`
			`f"<script>\n{fg_js}\n</script>",`
			`)`
			`html_template = html_template.replace(`
			`"<!-- INLINE_VENDOR_FLAMEGRAPH_TOOLTIP_JS -->",`
			`f"<script>\n{fg_tooltip_js}\n</script>",`
			`)`

			`# Replace the placeholder with actual data`
			`html_content = html_template.replace(`
			`"{{FLAMEGRAPH_DATA}}", data_json`
			`)`

			`return html_content`