Source code for app.alba_core.extraction

"""Requirement extraction for Alba Core.

This module owns the first step of the CRM/matching flow: turning a renter's
plain-English message into a :class:`RentalRequirements` object.

Inputs:
    * the latest customer message;
    * optional existing requirements from an earlier conversation turn;
    * known locations gathered from the real PropertyMe-backed cache.

Outputs:
    * structured requirements;
    * extraction confidence;
    * short notes explaining what was detected.

Do not add property ranking or lead-writing logic here. Extraction should only
describe what the renter asked for.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Iterable

from app.alba_core.models import RentalRequirements

# Spelled-out counts ("one" .. "ten") mapped to their integer value. The keys
# are listed in ascending order, so each word's value is its 1-based position.
NUMBER_WORDS = {
    word: value
    for value, word in enumerate(
        (
            "one",
            "two",
            "three",
            "four",
            "five",
            "six",
            "seven",
            "eight",
            "nine",
            "ten",
        ),
        start=1,
    )
}

# Renter wording -> canonical property type. Insertion order is kept exactly
# as before because lookups iterate this mapping and return the first hit.
PROPERTY_TYPE_WORDS = {
    synonym: canonical
    for canonical, synonyms in (
        ("house", ("house", "home")),
        ("apartment", ("apartment", "apt", "flat", "unit")),
        ("townhouse", ("townhouse", "town house", "townhome")),
    )
    for synonym in synonyms
}

# Lifestyle feature keywords. Every keyword currently maps to itself; the
# mapping form leaves room for synonyms that normalise to one feature name.
FEATURE_WORDS = {
    keyword: keyword
    for keyword in (
        "pool",
        "spa",
        "view",
        "sea view",
        "waterfront",
        "garden",
        "gym",
    )
}

# Phrases signalling that the renter has no firm preference. The extractor
# tests each entry as a plain substring of the casefolded message, so both
# apostrophe and apostrophe-less spellings are listed explicitly.
NO_PREFERENCE_PATTERNS = [
    "no preference",
    "not a concern",
    "isn't a concern",
    "isnt a concern",
    "dont care",
    "don't care",
    "flexible",
    "nothing",
]


[docs] @dataclass(slots=True) class ExtractionOutput: """Result returned by :class:`RequirementExtractor`. The output keeps requirements and explanation notes together so demos, tests, and future channel wrappers can show what Alba Core understood. """ requirements: RentalRequirements confidence: float notes: list[str]
[docs] def to_dict(self) -> dict[str, object]: """Return the API-friendly extraction response.""" return { "requirements": self.requirements.to_dict(), "confidence": round(self.confidence, 2), "notes": self.notes, "missing_required_fields": self.requirements.missing_required_fields, }
class RequirementExtractor:
    """Deterministic extraction for common renter wording.

    AI can be added later as a helper, but this class keeps the baseline
    repeatable and easy to test.
    """

    # One numeric amount: "650", "1,500" or "2.5". The trailing lookahead stops
    # the regex engine from backtracking into a prefix of a longer number
    # (for example reading "2" out of "2.5").
    _AMOUNT = r"\d+(?:,\d{3})*(?:\.\d+)?(?![.,]?\d)"

    # Rejects a number that is immediately followed by a count or duration
    # unit, so "3 bedrooms to rent" or "up to 2 baths" is never read as money.
    _NOT_A_COUNT = (
        r"(?!\s*(?:or\s+more\s+)?"
        r"(?:beds?|bedrooms?|br|baths?|bathrooms?|parking|garages?|car\s+spaces?"
        r"|weeks?|months?)\b)"
    )

    # Budget patterns, tried in order from the strongest money signal to the
    # weakest. Only the first two can capture a "k"/"grand" suffix (group 2).
    _BUDGET_PATTERNS = (
        # Explicit currency marker: "$650", "nzd 700", "$2.5k".
        re.compile(rf"(?:\$|nzd\s*)({_AMOUNT})\s*(k|grand)?\b"),
        # Thousands suffix without a currency marker: "5k", "3 grand".
        re.compile(rf"\b({_AMOUNT})\s*(k|grand)\b"),
        # Budget wording before the number: "budget is 650", "rent 700".
        re.compile(
            rf"\b(?:budget|rent|weekly|per week|pw)\D{{0,20}}({_AMOUNT})\b{_NOT_A_COUNT}"
        ),
        # Budget wording after the number: "650 per week", "700pw".
        re.compile(
            rf"\b({_AMOUNT}){_NOT_A_COUNT}\D{{0,20}}(?:budget|rent|weekly|per week|pw)\b"
        ),
        # Upper-bound wording: "under 5000", "up to 800".
        re.compile(
            rf"\b(?:under|up to|max|maximum|less than|below)\D{{0,12}}({_AMOUNT})\b{_NOT_A_COUNT}"
        ),
    )

    def __init__(self, known_locations: Iterable[str]) -> None:
        """Create an extractor grounded in locations Alba Core can serve.

        Locations are casefolded, de-duplicated, and sorted longest-first so
        names like "St Marys Bay" are matched before shorter overlapping
        terms. Equal-length names are ordered alphabetically so the result
        does not depend on set iteration order.
        """
        unique = {location.casefold() for location in known_locations if location}
        self.known_locations = sorted(unique, key=lambda name: (-len(name), name))

    def extract(
        self,
        message: str,
        current: RentalRequirements | None = None,
    ) -> ExtractionOutput:
        """Extract requirements from a single message and merge prior state.

        This method is intentionally a readable sequence of small extraction
        steps. When adding a new field, add the helper below and then call it
        in this flow so new contributors can see the full order at a glance.

        Args:
            message: The latest customer message.
            current: Requirements from an earlier turn, if any. The values
                detected in ``message`` are merged into it via ``merge``.

        Returns:
            The merged requirements, a confidence score, and one note per
            detected field.
        """
        current = current or RentalRequirements()
        lower = message.casefold()
        patch = RentalRequirements()
        notes: list[str] = []

        city_or_suburb = self._extract_location(lower)
        if city_or_suburb:
            patch.city = city_or_suburb.title()
            notes.append(f"location detected: {patch.city}")

        budget = self._extract_budget(lower)
        if budget is not None:
            patch.budget_max = budget
            notes.append(f"budget detected: {budget:.0f}")

        bedrooms = self._extract_count(lower, ["bed", "beds", "bedroom", "bedrooms", "br"])
        if bedrooms is not None:
            patch.bedrooms_min = bedrooms
            notes.append(f"bedrooms detected: {bedrooms}+")

        bathrooms = self._extract_count(lower, ["bath", "baths", "bathroom", "bathrooms"])
        if bathrooms is not None:
            patch.bathrooms_min = bathrooms
            notes.append(f"bathrooms detected: {bathrooms}+")

        parking = self._extract_parking(lower)
        if parking is not None:
            patch.parking_min = parking
            notes.append(
                "parking is not required" if parking == 0 else f"parking detected: {parking}+"
            )

        property_type = self._extract_property_type(lower)
        if property_type:
            patch.property_type = property_type
            notes.append(f"property type detected: {property_type}")

        move_in = self._extract_move_in(lower)
        if move_in:
            patch.move_in_timing = move_in
            notes.append(f"move-in timing detected: {move_in}")

        priority = self._extract_priority(lower)
        if priority:
            patch.priority = priority
            notes.append(f"priority detected: {priority}")

        patch.features = self._extract_features(lower)
        if patch.features:
            notes.append(f"features detected: {', '.join(patch.features)}")

        self._extract_boolean_preferences(lower, patch)
        patch.no_preference = self._extract_no_preference(lower)

        merged = current.merge(patch)
        confidence = self._confidence_for(merged, notes)
        return ExtractionOutput(requirements=merged, confidence=confidence, notes=notes)

    def _extract_location(self, lower: str) -> str | None:
        """Return the first known cache-backed location mentioned by the user.

        ``lower`` must already be casefolded; locations are matched on word
        boundaries in longest-first order.
        """
        for location in self.known_locations:
            if re.search(rf"\b{re.escape(location)}\b", lower):
                return location
        return None

    def _extract_budget(self, lower: str) -> float | None:
        """Extract a weekly budget without confusing counts for money.

        Renter messages often contain bare numbers for bedrooms, bathrooms,
        and budget. A number becomes a budget only when it has a money marker,
        a "k"/"grand" suffix, budget wording next to it, or upper-bound
        wording such as "under 5000". In the wording-based cases a number
        that is directly followed by a count or duration unit ("3 bedrooms",
        "2 weeks") is skipped.

        Returns:
            The budget as a float, or ``None`` when no plausible amount is
            found.
        """
        word_match = re.search(
            r"\b(one|two|three|four|five|six|seven|eight|nine|ten)\s+(?:grand|k)\b",
            lower,
        )
        if word_match:
            return float(NUMBER_WORDS[word_match.group(1)] * 1000)

        money_match = None
        for pattern in self._BUDGET_PATTERNS:
            money_match = pattern.search(lower)
            if money_match:
                break
        if money_match is None:
            return None

        amount = float(money_match.group(1).replace(",", ""))
        suffix = money_match.group(2) if money_match.re.groups > 1 else None
        # Heuristic kept from the original behaviour: amounts below 100 are
        # read as thousands ("budget 5" -> 5000), the same as an explicit "k".
        if suffix in {"k", "grand"} or amount < 100:
            amount *= 1000
        # Anything still below 150 is too small to be a rent figure.
        if amount < 150:
            return None
        return amount

    def _extract_count(self, lower: str, labels: list[str]) -> int | None:
        """Extract a numeric or word count next to labels such as bedrooms.

        Digits are preferred over number words; an optional "or more" may sit
        between the count and the label.
        """
        label_pattern = "|".join(re.escape(label) for label in labels)
        digit_match = re.search(rf"\b(\d+)\s*(?:or\s+more\s+)?(?:{label_pattern})\b", lower)
        if digit_match:
            return int(digit_match.group(1))

        word_pattern = "|".join(NUMBER_WORDS)
        word_match = re.search(
            rf"\b({word_pattern})\s*(?:or\s+more\s+)?(?:{label_pattern})\b", lower
        )
        if word_match:
            return NUMBER_WORDS[word_match.group(1)]
        return None

    def _extract_parking(self, lower: str) -> int | None:
        """Extract parking need, including explicit flexible/no-parking wording.

        Returns ``None`` when parking is not mentioned, ``0`` when the message
        also contains a no-preference phrase, the stated count when one is
        given, and ``1`` otherwise.
        """
        if "parking" not in lower and "garage" not in lower and "car space" not in lower:
            return None
        if any(pattern in lower for pattern in NO_PREFERENCE_PATTERNS):
            return 0
        count = self._extract_count(
            lower, ["parking", "garage", "garages", "car space", "car spaces"]
        )
        return count if count is not None else 1

    def _extract_property_type(self, lower: str) -> str | None:
        """Normalise common property type synonyms to one internal value."""
        for word, property_type in PROPERTY_TYPE_WORDS.items():
            if re.search(rf"\b{re.escape(word)}\b", lower):
                return property_type
        return None

    def _extract_move_in(self, lower: str) -> str | None:
        """Extract simple move-in timing phrases used by matching and summary."""
        if re.search(r"\b(asap|immediately|right away|now)\b", lower):
            return "asap"
        if re.search(r"\bnext month\b", lower):
            return "next month"
        # Search once and return the matched phrase itself, e.g. "in 3 weeks".
        if weeks_match := re.search(r"\bin\s+\d+\s+weeks?\b", lower):
            return weeks_match.group(0)
        return None

    def _extract_priority(self, lower: str) -> str | None:
        """Extract the user's stated priority only when the message asks for one.

        A priority is considered only if the message contains "matters most",
        "priority", or "important"; the first category with a keyword present
        in the message wins.
        """
        priorities = {
            "location": ["location", "area", "suburb", "nice spot"],
            "price": ["price", "budget", "cheap", "affordable"],
            "space": ["space", "bedrooms", "large", "room"],
            "move-in timing": ["move in", "asap", "available"],
            "lifestyle extras": ["pool", "view", "waterfront", "garden", "spa"],
        }
        if "matters most" in lower or "priority" in lower or "important" in lower:
            for priority, words in priorities.items():
                if any(word in lower for word in words):
                    return priority
        return None

    def _extract_features(self, lower: str) -> list[str]:
        """Return lifestyle features that can later affect soft scoring.

        Keywords are matched as whole words (with an optional plural "s") so
        that "space" is not read as "spa" and "review" is not read as "view".
        """
        return [
            feature
            for word, feature in FEATURE_WORDS.items()
            if re.search(rf"\b{re.escape(word)}s?\b", lower)
        ]

    def _extract_boolean_preferences(self, lower: str, patch: RentalRequirements) -> None:
        """Patch simple yes/no preferences such as pets and furnishing.

        ``patch`` is mutated in place; attributes are only assigned when the
        message mentions the corresponding preference.
        """
        if "pet friendly" in lower or "pets allowed" in lower or "with pets" in lower:
            patch.pets_required = True
        elif "no pets" in lower or "without pets" in lower:
            patch.pets_required = False

        # Negative wording is checked first: "not furnished" contains the
        # substring "furnished" and must not be read as a positive request.
        if "unfurnished" in lower or "not furnished" in lower:
            patch.furnished_required = False
        elif "furnished" in lower:
            patch.furnished_required = True

    def _extract_no_preference(self, lower: str) -> list[str]:
        """Record fields the renter explicitly says are flexible.

        Returns an empty list when no no-preference phrase is present, and
        ``["unspecified"]`` when one is present but no known field is named.
        """
        if not any(pattern in lower for pattern in NO_PREFERENCE_PATTERNS):
            return []
        fields: list[str] = []
        if "parking" in lower:
            fields.append("parking")
        if "bath" in lower:
            fields.append("bathrooms")
        if "must" in lower or "feature" in lower or "extra" in lower:
            fields.append("features")
        if not fields:
            fields.append("unspecified")
        return fields

    def _confidence_for(self, requirements: RentalRequirements, notes: list[str]) -> float:
        """Estimate confidence from detected fields and search readiness.

        Starts at 0.35, adds 0.08 per note (capped at 0.35), adds 0.25 when
        the requirements are search-ready, and caps the total at 0.95.
        """
        score = 0.35
        score += min(len(notes) * 0.08, 0.35)
        if requirements.is_search_ready:
            score += 0.25
        return min(score, 0.95)