From 86d55c41db7c9e0ac5c3931a636613aec457c9b4 Mon Sep 17 00:00:00 2001
From: andryyy
Date: Mon, 23 Dec 2019 10:20:34 +0100
Subject: [PATCH] [Rspamd] Touch bad lang map [Rspamd] SA trivial converter
 (wip)

---
 data/Dockerfiles/rspamd/docker-entrypoint.sh  |   1 +
 .../Dockerfiles/rspamd/sa_trivial_convert.lua | 481 ++++++++++++++++++
 2 files changed, 482 insertions(+)
 create mode 100644 data/Dockerfiles/rspamd/sa_trivial_convert.lua

diff --git a/data/Dockerfiles/rspamd/docker-entrypoint.sh b/data/Dockerfiles/rspamd/docker-entrypoint.sh
index 5cfc0c5a..8448cf94 100755
--- a/data/Dockerfiles/rspamd/docker-entrypoint.sh
+++ b/data/Dockerfiles/rspamd/docker-entrypoint.sh
@@ -50,6 +50,7 @@ touch /etc/rspamd/custom/global_mime_from_blacklist.map \
   /etc/rspamd/custom/global_mime_from_whitelist.map \
   /etc/rspamd/custom/global_rcpt_whitelist.map \
   /etc/rspamd/custom/global_smtp_from_whitelist.map \
+  /etc/rspamd/custom/bad_languages.map \
   /etc/rspamd/custom/sa-rules \
   /etc/rspamd/custom/dovecot_trusted.map \
   /etc/rspamd/custom/ip_wl.map \
diff --git a/data/Dockerfiles/rspamd/sa_trivial_convert.lua b/data/Dockerfiles/rspamd/sa_trivial_convert.lua
new file mode 100644
index 00000000..8cf0b913
--- /dev/null
+++ b/data/Dockerfiles/rspamd/sa_trivial_convert.lua
@@ -0,0 +1,481 @@
+-- Trivial SpamAssassin to Rspamd multimap converter (wip).
+--
+-- Reads every SA configuration file named on the command line and turns
+-- plain `header`, `body`, `rawbody`, `full` and `uri` regexp rules into
+-- regexp map files plus `auto_multimap.conf`. Every line that cannot be
+-- converted is written to `auto_sa.conf`. All output files are created in
+-- the current working directory.
+
+local fun = require "fun"
+local rspamd_logger = require "rspamd_logger"
+local util = require "rspamd_util"
+local lua_util = require "lua_util"
+local rspamd_regexp = require "rspamd_regexp"
+local ucl = require "ucl"
+
+-- Module name passed to lua_util.debugm
+local N = 'sa_trivial_convert'
+
+-- SA lines that could not be converted (they end up in auto_sa.conf)
+local complicated = {}
+-- Converted rules: rules[type][symbol] or rules.header[header][symbol]
+local rules = {}
+-- Symbol name -> score, collected from `score` lines
+local scores = {}
+-- Consumed `score` lines as { symbol = ..., line = ... }; those that belong
+-- to unconverted symbols are written back to auto_sa.conf
+local score_lines = {}
+-- Set of symbols that were converted into a map entry
+local converted = {}
+
+-- Joins all words after the first `start` ones into a single string
+local function words_to_re(words, start)
+  return table.concat(fun.totable(fun.drop_n(start, words)), " ")
+end
+
+-- Returns every match of the Lua pattern `delim` in `str`; without `delim`
+-- the string is split into whitespace separated tokens
+local function split(str, delim)
+  local result = {}
+
+  if not delim then
+    delim = '[^%s]+'
+  end
+
+  for token in string.gmatch(str, delim) do
+    table.insert(result, token)
+  end
+
+  return result
+end
+
+-- Builds a header transform that parses its argument as an address list and
+-- returns the `field` ('addr' or 'name') of every parsed element
+local function addr_field_extractor(field)
+  return function(str)
+    local ret = {}
+    local addr_parsed = util.parse_addr(str)
+
+    if addr_parsed then
+      for _,elt in ipairs(addr_parsed) do
+        if elt[field] then
+          table.insert(ret, elt[field])
+        end
+      end
+    end
+
+    return ret
+  end
+end
+
+-- Parses the header part of an SA `header` rule and stores the parsed
+-- headers in cur_rule['header']. cur_rule['ordinary'] becomes true only
+-- when the rule checks exactly one header without modifiers, which is the
+-- only form this converter turns into a header map.
+local function handle_header_def(hline, cur_rule)
+  -- Now check for modifiers inside header's name
+  local hdrs = split(hline, '[^|]+')
+  local hdr_params = {}
+  -- Check if an re is an ordinary re
+  local ordinary = true
+
+  for _,h in ipairs(hdrs) do
+    if h == 'ALL' or h == 'ALL:raw' then
+      ordinary = false
+    else
+      local args = split(h, '[^:]+')
+      -- One table per header: a table shared between alternatives would
+      -- leave every entry describing the last header only
+      local cur_param = {
+        strong = false,
+        raw = false,
+        header = args[1],
+      }
+
+      if args[2] then
+        -- We have some ops that are required for the header, so it's not ordinary
+        ordinary = false
+      end
+
+      fun.each(function(func)
+        if func == 'addr' or func == 'name' then
+          cur_param['function'] = addr_field_extractor(func)
+        elseif func == 'raw' then
+          cur_param['raw'] = true
+        elseif func == 'case' then
+          cur_param['strong'] = true
+        else
+          rspamd_logger.warnx(rspamd_config, 'Function %1 is not supported in %2',
+            func, cur_rule['symbol'])
+        end
+      end, fun.tail(args))
+
+      -- Some header rules require splitting to check multiple headers
+      if cur_param['header'] == 'MESSAGEID' or cur_param['header'] == 'ToCc' then
+        ordinary = false
+      else
+        table.insert(hdr_params, cur_param)
+      end
+    end
+  end
+
+  -- A header map is bound to a single header, so alternatives (`A|B`) and
+  -- definitions without a usable header are not ordinary either
+  cur_rule['ordinary'] = ordinary and #hdr_params == 1
+  cur_rule['header'] = hdr_params
+end
+
+-- Parses one opened SA configuration file. Convertible rules are stored in
+-- `rules`, scores in `scores`; every other non-empty, non-comment line is
+-- appended to `complicated`.
+local function process_sa_conf(f)
+  local cur_rule = {}
+  local valid_rule = false
+  local skip_to_endif = false
+  local if_nested = 0
+  -- SA keyword -> internal rule type for rules of the form
+  -- `keyword SYMBOL /regexp/`
+  local body_types = {
+    body = 'sabody',
+    rawbody = 'sarawbody',
+    full = 'full',
+  }
+
+  -- Keeps the line for auto_sa.conf
+  local function keep(l)
+    table.insert(complicated, l)
+  end
+
+  -- Files the pending rule under rules[type] (header rules additionally
+  -- under their header name) and resets the parser state
+  local function insert_cur_rule()
+    local rule = cur_rule
+    local symbol = rule['symbol']
+    local hdr
+
+    -- Reset first so that a rejected rule cannot linger as pending
+    cur_rule = {}
+    valid_rule = false
+
+    if rule.type == 'header' then
+      hdr = rule.header and rule.header[1] and rule.header[1].header
+    end
+
+    if not symbol or not rule.type or (rule.type == 'header' and not hdr) then
+      rspamd_logger.errx(rspamd_config, 'bad rule definition: %1', rule)
+      return
+    end
+
+    local target = rules[rule.type]
+    if not target then
+      target = {}
+      rules[rule.type] = target
+    end
+
+    if hdr then
+      -- Header names are case-insensitive and the map file name is
+      -- lower-cased, so `Message-ID` and `Message-Id` must share one map
+      hdr = hdr:lower()
+      if not target[hdr] then
+        target[hdr] = {}
+      end
+      target = target[hdr]
+    end
+
+    target[symbol] = rule
+    converted[symbol] = true
+  end
+
+  -- Flushes the pending rule, if any, and starts a clean one
+  local function begin_rule(rtype, symbol)
+    if valid_rule then
+      insert_cur_rule()
+    end
+
+    cur_rule = { type = rtype, symbol = symbol }
+  end
+
+  -- Returns the score of a split `score` line, 0 on an unexpected field count
+  local function parse_score(words)
+    if #words == 3 then
+      -- score rule
+      lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[3])
+      return tonumber(words[3])
+    elseif #words == 6 then
+      -- score rule
+      -- we assume here that bayes and network are enabled and select
+      lua_util.debugm(N, rspamd_config, 'found score for %1: %2', words[2], words[6])
+      return tonumber(words[6])
+    else
+      rspamd_logger.errx(rspamd_config, 'invalid score for %1', words[2])
+    end
+
+    return 0
+  end
+
+  -- Handles a single line of the file
+  local function process_line(l)
+    l = lua_util.rspamd_str_trim(l)
+    -- Replace bla=~/re/ with bla =~ /re/ (#2372)
+    l = l:gsub('([^%s])%s*([=!]~)%s*([^%s])', '%1 %2 %3')
+
+    if string.len(l) == 0 or string.sub(l, 1, 1) == '#' then
+      return
+    end
+
+    -- Unbalanced if/endif
+    if if_nested < 0 then if_nested = 0 end
+
+    if skip_to_endif then
+      if string.match(l, '^endif') then
+        if_nested = if_nested - 1
+
+        if if_nested == 0 then
+          skip_to_endif = false
+        end
+      elseif string.match(l, '^if') then
+        if_nested = if_nested + 1
+      elseif string.match(l, '^else') then
+        -- Else counterpart for if
+        skip_to_endif = false
+      end
+      keep(l)
+      return
+    end
+
+    -- Conditional directives are never evaluated, they are only kept.
+    -- Returning here avoids storing the directive a second time below.
+    if string.match(l, '^if') then
+      -- ifplugin, if !plugin(...) or any other condition
+      skip_to_endif = true
+      if_nested = if_nested + 1
+      keep(l)
+      return
+    elseif string.match(l, '^else') then
+      -- Else counterpart for if
+      skip_to_endif = true
+      keep(l)
+      return
+    elseif string.match(l, '^endif') then
+      if_nested = if_nested - 1
+      keep(l)
+      return
+    end
+
+    -- Skip comments
+    local words = fun.totable(fun.take_while(
+      function(w) return string.sub(w, 1, 1) ~= '#' end,
+      split(l)))
+    local kw = words[1]
+
+    if kw == 'header' then
+      -- header SYMBOL Header =~ /regexp/
+      begin_rule('header', words[2])
+
+      -- Only positive regexp matches are converted
+      if words[4] ~= '=~' then
+        keep(l)
+        return
+      end
+
+      cur_rule['re_expr'] = words_to_re(words, 4)
+      if string.find(cur_rule['re_expr'], '%s+%[if%-unset:') then
+        keep(l)
+        return
+      end
+
+      cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+      if not cur_rule['re'] then
+        rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+          cur_rule['re_expr'], cur_rule['symbol'])
+        keep(l)
+        return
+      end
+
+      handle_header_def(words[3], cur_rule)
+      if not cur_rule['ordinary'] then
+        keep(l)
+        return
+      end
+
+      valid_rule = true
+    elseif body_types[kw] then
+      -- body|rawbody|full SYMBOL /regexp/
+      begin_rule(body_types[kw], words[2])
+
+      local first = words[3] and string.sub(words[3], 1, 1)
+      if first ~= '/' and first ~= 'm' then
+        -- might be function
+        keep(l)
+        return
+      end
+
+      cur_rule['re_expr'] = words_to_re(words, 2)
+      cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+      if kw == 'full' then
+        cur_rule['raw'] = true
+      end
+
+      if not cur_rule['re'] then
+        -- Keep the rule for SA instead of silently dropping it
+        rspamd_logger.warnx(rspamd_config, "Cannot parse regexp '%1' for %2",
+          cur_rule['re_expr'], cur_rule['symbol'])
+        keep(l)
+        return
+      end
+
+      valid_rule = true
+    elseif kw == 'uri' then
+      -- uri SYMBOL /regexp/
+      begin_rule('uri', words[2])
+      cur_rule['re_expr'] = words_to_re(words, 2)
+      cur_rule['re'] = rspamd_regexp.create(cur_rule['re_expr'])
+
+      if not (cur_rule['re'] and cur_rule['symbol']) then
+        keep(l)
+        return
+      end
+
+      valid_rule = true
+    elseif kw == 'meta' then
+      -- meta SYMBOL expression
+      begin_rule()
+      keep(l)
+    elseif kw == 'describe' and valid_rule and words[2] == cur_rule['symbol'] then
+      cur_rule['description'] = words_to_re(words, 2)
+    elseif kw == 'score' and words[2] then
+      scores[words[2]] = parse_score(words)
+      table.insert(score_lines, { symbol = words[2], line = l })
+    else
+      keep(l)
+    end
+  end
+
+  for l in f:lines() do
+    process_line(l)
+  end
+
+  if valid_rule then
+    insert_cur_rule()
+  end
+end
+
+-- Load every SA file named on the command line
+for _,matched in ipairs(arg) do
+  local f = io.open(matched, "r")
+  if f then
+    rspamd_logger.messagex(rspamd_config, 'loading SA rules from %s', matched)
+    process_sa_conf(f)
+    f:close()
+  else
+    rspamd_logger.errx(rspamd_config, "cannot open %1", matched)
+  end
+end
+
+-- Multimap definitions keyed by lower-cased map name
+local multimap_conf = {}
+
+-- Writes the regexp map for one group of rules and registers the matching
+-- multimap definition in `multimap_conf`.
+-- `what` is the internal rule type, `syms` maps symbol -> rule and `hdr` is
+-- the header name (header rules only).
+local function handle_rule(what, syms, hdr)
+  local mtype
+  local filter
+  local fname
+  -- Must be local: a global would leak the header into later non-header maps
+  local header
+  local sym = what:upper()
+
+  if what == 'sabody' then
+    mtype = 'content'
+    fname = 'body_re.map'
+    filter = 'oneline'
+  elseif what == 'sarawbody' then
+    fname = 'raw_body_re.map'
+    mtype = 'content'
+    filter = 'rawtext'
+  elseif what == 'full' then
+    fname = 'full_re.map'
+    mtype = 'content'
+    filter = 'full'
+  elseif what == 'uri' then
+    fname = 'uri_re.map'
+    mtype = 'url'
+    filter = 'full'
+  elseif what == 'header' then
+    fname = ('hdr_' .. hdr .. '_re.map'):lower()
+    mtype = 'header'
+    header = hdr
+    sym = sym .. '_' .. hdr:upper()
+  else
+    rspamd_logger.errx('unknown type: %s', what)
+    return
+  end
+
+  local re_file, err = io.open(fname, 'w')
+  if not re_file then
+    rspamd_logger.errx('cannot open %s for writing: %s', fname, err)
+    return
+  end
+
+  local conf = {
+    type = mtype,
+    filter = filter,
+    symbol = 'SA_MAP_AUTO_' .. sym,
+    regexp = true,
+    map = fname,
+    header = header,
+    symbols = {}
+  }
+
+  -- Sort the symbols so that repeated runs produce identical files
+  local names = {}
+  for k in pairs(syms) do
+    names[#names + 1] = k
+  end
+  table.sort(names)
+
+  for _,k in ipairs(names) do
+    local score = scores[k] or 0.0
+    -- NOTE(review): assumes tostring() of a regexp object yields its bare
+    -- pattern without delimiters and flags - confirm
+    re_file:write(string.format('/%s/ %s:%f\n', tostring(syms[k].re), k, score))
+    table.insert(conf.symbols, k)
+  end
+
+  re_file:close()
+
+  multimap_conf[sym:lower()] = conf
+  rspamd_logger.messagex('stored %s regexp in %s', sym:lower(), fname)
+end
+
+for k,v in pairs(rules) do
+  if k == 'header' then
+    for h,r in pairs(v) do
+      handle_rule(k, r, h)
+    end
+  else
+    handle_rule(k, v)
+  end
+end
+
+local out = ucl.to_format(multimap_conf, 'ucl')
+local mmap_conf = assert(io.open('auto_multimap.conf', 'w'))
+mmap_conf:write(out)
+mmap_conf:close()
+rspamd_logger.messagex('stored multimap conf in %s', 'auto_multimap.conf')
+
+local sa_remain = assert(io.open('auto_sa.conf', 'w'))
+-- Lines are stored trimmed, so the line terminator has to be added back
+for _,l in ipairs(complicated) do
+  sa_remain:write(l, '\n')
+end
+-- Scores of rules that stay with SA must not get lost
+for _,s in ipairs(score_lines) do
+  if not converted[s.symbol] then
+    sa_remain:write(s.line, '\n')
+  end
+end
+sa_remain:close()
+rspamd_logger.messagex('stored sa remains conf in %s', 'auto_sa.conf')